In [None]:
# Reload imported modules automatically before each cell executes, so local
# edits to library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the nb_black extension (auto-formats code cells); needs the nb_black package.
%load_ext nb_black

In [None]:
import os

# Order GPUs by PCI bus ID and expose only device "1" to CUDA.
# These variables are set before TensorFlow is imported further down.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
# Request TensorFlow 2.x through the `%tensorflow_version` magic.
# NOTE(review): this magic is presumably only available on Colab and the
# try/except is meant to make the cell harmless elsewhere — confirm.
try:
    %tensorflow_version 2.x
except Exception:
    pass

In [None]:
# Set to True to run the one-time setup cells (package installs, dataset download).
SETUP = False

In [None]:
# One-time environment setup; skipped unless SETUP is True.
# Packages are upgraded to their latest versions (no version pins).
if SETUP:
    !pip install -q -U toai
    !pip install -q -U nb_black
    !pip install -q -U tensorflow-datasets
    !pip install -q -U --no-deps tensorflow-addons
    !pip install -q -U tensorflow_hub
    print(__import__("toai").__version__)
    print(__import__("tensorflow").__version__)

In [None]:
# Show the toai version in use.
print(__import__("toai").__version__)

In [None]:
from toai.imports import *
from toai.utils import save_file, load_file
from toai.data import DataContainer, DataBundle
from toai.metrics import sparse_top_2_categorical_accuracy
from toai.models import save_keras_model, load_keras_model
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub

In [None]:
# DATA_DIR holds the downloaded dataset; TEMP_DIR holds artifacts written by
# this notebook (label map, TensorBoard logs, saved models).
DATA_DIR = Path("data/womens-ecommerce-clothing-reviews")
TEMP_DIR = Path("temp/womens-ecommerce-clothing-reviews")

In [None]:
# One-time download: wipe both directories, recreate them, then fetch and
# unzip the Kaggle dataset into DATA_DIR.
if SETUP:
    shutil.rmtree(str(DATA_DIR), ignore_errors=True)
    shutil.rmtree(str(TEMP_DIR), ignore_errors=True)
    # mkdir is called without exist_ok, so it raises FileExistsError if one of
    # the removals above failed silently.
    DATA_DIR.mkdir(parents=True)
    TEMP_DIR.mkdir(parents=True)
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files(
        dataset="nicapotato/womens-ecommerce-clothing-reviews",
        path=DATA_DIR,
        unzip=True,
    )

In [None]:
# BATCH_SIZE is shared by every tf.data pipeline below; AUTOTUNE is passed as
# num_parallel_calls / prefetch buffer size so tf.data picks the values itself.
BATCH_SIZE = 32
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Load the raw reviews table from the downloaded CSV.
all_data = pd.read_csv(
    DATA_DIR / "Womens Clothing E-Commerce Reviews.csv", low_memory=False
)

In [None]:
# Column dtypes and non-null counts.
all_data.info()

In [None]:
# Summary statistics for all columns, numeric and non-numeric.
all_data.describe(include="all")

In [None]:
all_data.head()

In [None]:
# Distribution of the "Rating" column, which is used as the target below.
all_data["Rating"].value_counts()

In [None]:
def drop_values(df, col_name, values):
    """Return ``df`` without the rows whose ``col_name`` entry is in ``values``.

    Args:
        df: Source DataFrame; it is not modified.
        col_name: Column whose entries are checked.
        values: Collection of entries to exclude.

    Returns:
        A new DataFrame holding the remaining rows, re-indexed from 0.
    """
    is_unwanted = df[col_name].isin(values)
    kept_rows = df.loc[~is_unwanted, :]
    return kept_rows.reset_index(drop=True)

In [None]:
def drop_rare_values(df, col_name, threshold):
    """Return only the rows of ``df`` whose ``col_name`` entry is frequent enough.

    Args:
        df: Source DataFrame; it is not modified.
        col_name: Column whose entry frequencies are measured.
        threshold: Relative frequency (fraction of rows) an entry must
            strictly exceed to be kept.

    Returns:
        A new DataFrame holding the rows with frequent entries, re-indexed from 0.
    """
    frequencies = df[col_name].value_counts(normalize=True)
    frequent = frequencies[frequencies > threshold]
    is_frequent = df[col_name].isin(frequent.index)
    return df.loc[is_frequent, :].reset_index(drop=True)

In [None]:
# Keep only the rows that have review text, since the text is the model input.
available_data = all_data[~all_data["Review Text"].isna()]

In [None]:
available_data.info()

In [None]:
# Wrap the review text (x) and rating (y) columns into a toai DataBundle.
available_data_bundle = DataBundle.from_dataframe(
    dataframe=available_data, x_col="Review Text", y_col="Rating"
)

In [None]:
# Split into train / validation / test with fractions 0.8 / 0.1 / 0.1.
# NOTE(review): random=False presumably means an ordered (unshuffled) split —
# confirm against toai's DataBundle.split.
train_data, valid_data, test_data = DataBundle.split(
    data_bundle=available_data_bundle, fracs=(0.8, 0.1, 0.1), random=False
)

In [None]:
# Build the label map from the training split only.
label_map = train_data.make_label_map()

In [None]:
label_map

In [None]:
# Round-trip the label map through disk (written to TEMP_DIR, then read back).
save_file(label_map, TEMP_DIR / "label_map.pickle")

In [None]:
label_map = load_file(TEMP_DIR / "label_map.pickle")

In [None]:
label_map

In [None]:
# Apply the same label map to all three splits.
# NOTE(review): these calls look like in-place mutation, so re-running the
# cells may not be idempotent — confirm against toai.
train_data.apply_label_map(label_map)

In [None]:
valid_data.apply_label_map(label_map)

In [None]:
test_data.apply_label_map(label_map)

In [None]:
# Per-class counts of the training split before rebalancing.
train_data.value_counts()

In [None]:
# {0: 653, 1: 1241, 2: 2278, 3: 3861, 4: 10080}

In [None]:
# Map class index -> "balanced" weight (inversely proportional to class
# frequency in the training labels). Arguments are passed by keyword because
# scikit-learn deprecated the positional form in 0.24 and made them
# keyword-only in 1.0.
class_weights = dict(
    enumerate(
        sk.utils.class_weight.compute_class_weight(
            class_weight="balanced",
            classes=np.unique(train_data.y),
            y=train_data.y,
        )
    )
)

In [None]:
class_weights

In [None]:
# NOTE(review): presumably resamples every class towards 5000 examples —
# confirm against toai's DataBundle.from_unbalanced.
# `train_data` is rebound here, so re-running this cell operates on the already
# rebalanced bundle. `class_weights` above was computed before this step and is
# still what gets passed to training later.
train_data = DataBundle.from_unbalanced(train_data, 5000, train_data.value_counts())

In [None]:
# Per-class counts of the training split after rebalancing.
train_data.value_counts()

In [None]:
@attr.s(auto_attribs=True)
class TextPreprocessor:
    """Tokenize a batch of raw review strings for use in ``tf.data.Dataset.map``.

    Must be applied after ``.batch(...)``: the 2-D slice in ``__call__`` needs a
    batch dimension.
    """

    # Maximum number of tokens kept per review; longer reviews are truncated.
    max_length: int = 100
    # Filler token used to pad shorter reviews up to the longest one in the batch.
    default_value: bytes = b"<pad>"

    def __call__(self, text: tf.Tensor, label: tf.Tensor) -> tuple:
        """Clean, split, truncate and pad ``text``; pass ``label`` through.

        Args:
            text: Batch of raw review strings.
            label: Labels for the batch; returned unchanged.

        Returns:
            ``(tokens, label)`` where ``tokens`` is a dense string tensor with
            one row per review, padded with ``default_value``.
        """
        # Anything that is not an ASCII letter or apostrophe becomes a space.
        text = tf.strings.regex_replace(text, b"[^a-zA-Z']", b" ")
        # Whitespace split yields a variable number of tokens per review.
        text = tf.strings.split(text)
        text = text[:, : self.max_length]
        return text.to_tensor(default_value=self.default_value), label

In [None]:
# Tokenized (but not yet id-encoded) training batches; used below only to
# count tokens for the vocabulary.
base_dataset = (
    train_data.to_dataset()
    .shuffle(len(train_data))
    .batch(BATCH_SIZE)
    .map(TextPreprocessor(), num_parallel_calls=AUTOTUNE)
)

In [None]:
def make_vocabulary(dataset):
    """Count how often each token occurs across a tokenized dataset.

    Args:
        dataset: Iterable of ``(tokens, label)`` batches. ``tokens`` iterates
            over reviews, each exposing ``.numpy().tolist()`` to get its tokens.

    Returns:
        A ``Counter`` mapping token -> number of occurrences. Padding tokens
        present in the batches are counted like any other token.
    """
    return Counter(
        token
        for batch, _ in dataset
        for review in batch
        for token in review.numpy().tolist()
    )

In [None]:
# Token counts over the tokenized training batches. The b"<pad>" filler added
# by TextPreprocessor is part of those batches, so it is counted too.
vocabulary = make_vocabulary(base_dataset)

In [None]:
# Number of distinct tokens seen.
len(vocabulary)

In [None]:
# Ten most frequent tokens with their counts.
vocabulary.most_common()[:10]

In [None]:
# Maximum number of tokens that get their own id; the rest are treated as
# out-of-vocabulary.
VOCABULARY_SIZE = 9000

In [None]:
# The VOCABULARY_SIZE most frequent tokens, ordered from most to least common.
truncated_vocabulary = [token for token, _ in vocabulary.most_common(VOCABULARY_SIZE)]

In [None]:
len(truncated_vocabulary)

In [None]:
# Token -> integer id, where the id is the token's position in
# truncated_vocabulary (0 = most frequent).
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}

In [None]:
# Manual lookup demo: tokens missing from the vocabulary print VOCABULARY_SIZE.
for word in b"it was the most amazing dress".split():
    print(word_to_id.get(word, VOCABULARY_SIZE))

In [None]:
# Keys for the lookup table: the in-vocabulary tokens.
words = tf.constant(truncated_vocabulary)

In [None]:
# Values for the lookup table: token i gets id i.
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)

In [None]:
# Number of hash buckets reserved for out-of-vocabulary tokens.
n_oov_buckets = VOCABULARY_SIZE // 10

In [None]:
n_oov_buckets

In [None]:
# Static token -> id table; tokens not in `words` are hashed into one of the
# n_oov_buckets extra ids.
table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(words, word_ids), n_oov_buckets
)

In [None]:
# Smoke test on a sentence that ends with a made-up word.
table.lookup(tf.constant([b"These shoes are very patogus".split()]))

In [None]:
@attr.s(auto_attribs=True)
class WordEncoder:
    """Map batches of string tokens to integer ids in ``tf.data.Dataset.map``."""

    # Lookup table that converts each token to its integer id.
    vocabulary_table: tf.lookup.StaticVocabularyTable

    def __call__(self, text: tf.Tensor, labels: tf.Tensor) -> tuple:
        """Replace every token in ``text`` by its id; pass ``labels`` through.

        Args:
            text: String tensor of tokens.
            labels: Labels for the batch; returned unchanged.

        Returns:
            ``(token_ids, labels)`` where ``token_ids`` is the result of looking
            ``text`` up in ``vocabulary_table``.
        """
        return self.vocabulary_table.lookup(text), labels

In [None]:
# Training pipeline: shuffle -> batch -> tokenize -> encode to ids -> cache ->
# repeat forever -> prefetch.
# NOTE(review): `.cache()` comes after `.shuffle()`, so the batches cached on
# the first pass are presumably replayed in the same order on later epochs —
# confirm whether per-epoch reshuffling is wanted.
train_dataset = (
    train_data.to_dataset()
    .shuffle(len(train_data))
    .batch(BATCH_SIZE)
    .map(TextPreprocessor(), num_parallel_calls=AUTOTUNE)
    .map(WordEncoder(vocabulary_table=table), num_parallel_calls=AUTOTUNE)
    .cache()
    .repeat()
    .prefetch(AUTOTUNE)
)

In [None]:
# Validation pipeline: same preprocessing, but no shuffle and no repeat.
valid_dataset = (
    valid_data.to_dataset()
    .batch(BATCH_SIZE)
    .map(TextPreprocessor(), num_parallel_calls=AUTOTUNE)
    .map(WordEncoder(vocabulary_table=table), num_parallel_calls=AUTOTUNE)
    .cache()
    .prefetch(AUTOTUNE)
)

In [None]:
# Test pipeline: identical to the validation pipeline.
test_dataset = (
    test_data.to_dataset()
    .batch(BATCH_SIZE)
    .map(TextPreprocessor(), num_parallel_calls=AUTOTUNE)
    .map(WordEncoder(vocabulary_table=table), num_parallel_calls=AUTOTUNE)
    .cache()
    .prefetch(AUTOTUNE)
)

In [None]:
# Bundle the three pipelines with the label map. train_steps is the number of
# batches in one pass over the training data; it is needed because the training
# dataset repeats forever.
data_container = DataContainer(
    base=train_dataset,
    train=train_dataset,
    train_steps=math.ceil(len(train_data) / BATCH_SIZE),
    validation=valid_dataset,
    test=test_dataset,
    label_map=label_map,
)

In [None]:
# Inspect one training batch: shapes plus the first example and its label.
for x, y in data_container.train.take(1):
    print(x.shape)
    print(y.shape)
    print(x[0])
    print(y[0])

In [None]:
# Same inspection for one validation batch.
for x, y in data_container.validation.take(1):
    print(x.shape)
    print(y.shape)
    print(x[0])
    print(y[0])

In [None]:
def make_sequential_lstm_model(
    n_categories, embedding_size, lstm_size, lstm_dropout, dropout
):
    """Build an embedding + two stacked bidirectional LSTMs + softmax classifier.

    The embedding table size comes from the notebook-level ``VOCABULARY_SIZE``
    and ``n_oov_buckets``.

    Args:
        n_categories: Number of output classes.
        embedding_size: Dimension of the token embeddings.
        lstm_size: Units of each LSTM (per direction).
        lstm_dropout: Input dropout rate used inside both LSTMs.
        dropout: Dropout rate applied before the output layer.

    Returns:
        An uncompiled ``keras.models.Sequential`` model.
    """
    # mask_zero=True makes id 0 act as padding.
    # NOTE(review): this relies on the pad token having id 0 in the lookup
    # table — confirm.
    token_embedding = keras.layers.Embedding(
        VOCABULARY_SIZE + n_oov_buckets,
        embedding_size,
        mask_zero=True,
        input_shape=[None],
    )
    # First recurrent layer emits the full sequence for the second one to read.
    sequence_encoder = keras.layers.Bidirectional(
        keras.layers.LSTM(lstm_size, dropout=lstm_dropout, return_sequences=True)
    )
    # Second recurrent layer emits a single vector per review.
    summary_encoder = keras.layers.Bidirectional(
        keras.layers.LSTM(lstm_size, dropout=lstm_dropout, return_sequences=False)
    )
    regularizer = keras.layers.Dropout(dropout)
    classifier = keras.layers.Dense(
        n_categories, activation=keras.activations.softmax
    )
    return keras.models.Sequential(
        [token_embedding, sequence_encoder, summary_encoder, regularizer, classifier]
    )

In [None]:
# Instantiate the LSTM classifier with one output per class in the container.
model = make_sequential_lstm_model(
    n_categories=data_container.n_classes,
    embedding_size=256,
    lstm_size=256,
    lstm_dropout=0.1,
    dropout=0.5,
)

In [None]:
# Integer labels -> sparse categorical loss/metrics. The optimizer uses
# `learning_rate=` because `lr=` is a deprecated alias that recent Keras
# versions no longer accept.
model.compile(
    loss=keras.losses.sparse_categorical_crossentropy,
    optimizer=keras.optimizers.Adam(learning_rate=3e-4),
    metrics=[
        keras.metrics.sparse_categorical_accuracy,
        sparse_top_2_categorical_accuracy,
    ],
)

In [None]:
# Train for up to 20 "epochs" of one fifth of a full pass each. The learning
# rate is multiplied by 0.3 after 1 epoch without improvement, and training
# stops after 3 such epochs, restoring the best weights.
history = model.fit(
    data_container.train,
    steps_per_epoch=data_container.train_steps // 5,
    validation_data=data_container.validation,
    epochs=20,
    callbacks=[
        keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.3),
        keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    ],
)

In [None]:
# Loss and metrics on the validation set.
model.evaluate(data_container.validation)

In [None]:
# Per-class precision/recall on the validation set: true labels are gathered
# batch by batch, predictions are the argmax over the softmax outputs.
print(
    classification_report(
        np.concatenate([y.numpy() for _, y in data_container.validation]),
        model.predict(data_container.validation).argmax(axis=1),
    )
)

In [None]:
# Rebuild the pipelines without tokenization/encoding: the TF-Hub models below
# take raw strings. This rebinds train_dataset, valid_dataset, test_dataset and
# data_container, replacing the id-encoded versions defined earlier.
# NOTE(review): as before, `.cache()` comes after `.shuffle()` — confirm the
# fixed batch order across epochs is acceptable.
train_dataset = (
    train_data.to_dataset()
    .shuffle(len(train_data))
    .batch(BATCH_SIZE)
    .cache()
    .repeat()
    .prefetch(AUTOTUNE)
)

In [None]:
valid_dataset = valid_data.to_dataset().batch(BATCH_SIZE).cache().prefetch(AUTOTUNE)

In [None]:
test_dataset = test_data.to_dataset().batch(BATCH_SIZE).cache().prefetch(AUTOTUNE)

In [None]:
# Container for the raw-string pipelines; same layout as the earlier one.
data_container = DataContainer(
    base=train_dataset,
    train=train_dataset,
    train_steps=math.ceil(len(train_data) / BATCH_SIZE),
    validation=valid_dataset,
    test=test_dataset,
    label_map=label_map,
)

In [None]:
def _compile_classifier(model, optimizer):
    """Compile ``model`` with the loss and metrics shared by both training phases."""
    model.compile(
        loss=keras.losses.sparse_categorical_crossentropy,
        optimizer=optimizer,
        metrics=[
            keras.metrics.sparse_categorical_accuracy,
            sparse_top_2_categorical_accuracy,
        ],
    )


def train_model(
    model,
    data_container,
    epochs,
    lrs=None,
    optimizers=None,
    patience=5,
    class_weights=None,
    verbose=1,
    log_dir=str(TEMP_DIR / "logs"),
):
    """Train ``model`` in two phases, making its first layer trainable for the second.

    Phase 1 fits the model as given. Phase 2 sets ``model.layers[0].trainable``
    to True, recompiles with the second optimizer and fits again, this time
    also logging to TensorBoard.

    Args:
        model: Keras model to train; it is compiled and fitted in place.
        data_container: Provides ``train``, ``train_steps`` and ``validation``.
        epochs: Sequence with the maximum epoch count for phase 1 and phase 2.
        lrs: Learning rates for phase 1 and phase 2; used to create Adam
            optimizers when ``optimizers`` is not given.
        optimizers: Optimizers for phase 1 and phase 2; takes precedence over
            ``lrs``.
        patience: Early-stopping patience for both phases. Also the
            learning-rate-reduction patience in phase 1; phase 2 uses
            ``patience // 2`` for that.
        class_weights: Optional class-index -> weight mapping passed to ``fit``.
        verbose: Verbosity passed to ``fit``.
        log_dir: TensorBoard log directory used in phase 2.

    Raises:
        ValueError: If neither ``lrs`` nor ``optimizers`` is given, or if fewer
            than two optimizers / epoch counts are supplied.
    """
    if optimizers is None:
        if lrs is None:
            raise ValueError("Either `lrs` or `optimizers` must be provided.")
        optimizers = [keras.optimizers.Adam(lr) for lr in lrs]
    # Fail before any training starts rather than after phase 1 has finished.
    if len(optimizers) < 2 or len(epochs) < 2:
        raise ValueError(
            "`epochs` and `optimizers`/`lrs` need one entry per training phase (2)."
        )

    # Phase 1: train with the layers' current trainable flags.
    _compile_classifier(model, optimizers[0])
    model.fit(
        data_container.train,
        steps_per_epoch=data_container.train_steps,
        validation_data=data_container.validation,
        epochs=epochs[0],
        callbacks=[
            keras.callbacks.ReduceLROnPlateau(patience=patience, factor=0.3),
            keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
        ],
        class_weight=class_weights,
        verbose=verbose,
    )

    # Phase 2: unfreeze the first layer. The recompile is required for the
    # trainable change to take effect.
    model.layers[0].trainable = True
    _compile_classifier(model, optimizers[1])
    model.fit(
        data_container.train,
        steps_per_epoch=data_container.train_steps,
        validation_data=data_container.validation,
        epochs=epochs[1],
        callbacks=[
            keras.callbacks.ReduceLROnPlateau(patience=patience // 2, factor=0.3),
            keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            keras.callbacks.TensorBoard(log_dir=log_dir),
        ],
        class_weight=class_weights,
        verbose=verbose,
    )

In [None]:
def make_hub_model(url, n_categories):
    """Build a classifier on top of a TF-Hub text embedding.

    Args:
        url: TF-Hub handle of a module that maps raw strings to embeddings.
        n_categories: Number of output classes.

    Returns:
        An uncompiled ``keras.Sequential`` model: hub layer -> dropout(0.5) ->
        softmax dense layer.
    """
    # input_shape=[] means each example is a single scalar string.
    text_embedding = hub.KerasLayer(url, dtype=tf.string, input_shape=[])
    regularizer = keras.layers.Dropout(0.5)
    classifier = keras.layers.Dense(
        n_categories, activation=keras.activations.softmax
    )
    return keras.Sequential([text_embedding, regularizer, classifier])

In [None]:
def run_models(urls, data_container, class_weights):
    """Train and save one TF-Hub based classifier per URL.

    For each URL the model is built, trained in two phases with ``train_model``
    and saved twice under ``TEMP_DIR``: as ``<name>.h5`` and as separate
    architecture/weights files inside ``<name>/``.

    Args:
        urls: TF-Hub handles; the model name is the 5th "/"-separated part
            of each URL.
        data_container: Data passed through to ``train_model``.
        class_weights: Class-index -> weight mapping passed to ``train_model``.
    """
    for url in urls:
        model = make_hub_model(url, data_container.n_classes)
        model_name = url.split("/")[4]
        print(f" {model_name} ".center(80, "="))
        # Start from a clean directory so old logs/weights do not mix with new ones.
        shutil.rmtree(str(TEMP_DIR / model_name), ignore_errors=True)
        train_model(
            model=model,
            data_container=data_container,
            epochs=[25, 15],
            # `learning_rate=` replaces the deprecated `lr=` alias, which
            # recent Keras versions no longer accept.
            optimizers=[
                keras.optimizers.Adam(learning_rate=3e-4),
                keras.optimizers.Adam(learning_rate=1e-4),
            ],
            class_weights=class_weights,
            patience=4,
            verbose=2,
            log_dir=str(TEMP_DIR / model_name),
        )
        model.save(f"{TEMP_DIR / model_name}.h5")
        save_keras_model(
            model,
            str(TEMP_DIR / model_name / "architecture"),
            str(TEMP_DIR / model_name / "weights"),
        )
        # Drop the model and reset Keras' global state before the next model.
        del model
        keras.backend.clear_session()

In [None]:
def evaluate_models(urls, data_container):
    """Load each saved model and build its validation classification report.

    The ``<name>.h5`` file is tried first. If loading it fails, the model is
    rebuilt from the separately saved architecture and weights.

    Args:
        urls: TF-Hub handles used when the models were trained; the model name
            is the 5th "/"-separated part of each URL.
        data_container: Provides the ``validation`` dataset.

    Returns:
        Dict mapping model name -> ``classification_report`` text.
    """
    reports = {}
    for url in urls:
        model_name = url.split("/")[4]
        print(f" {model_name} ".center(80, "="))
        try:
            # Was `keras.model.load_model`: the AttributeError was swallowed by
            # a bare except, so the .h5 file was never actually used.
            model = keras.models.load_model(
                f"{TEMP_DIR / model_name}.h5",
                custom_objects={"KerasLayer": hub.KerasLayer},
            )
        except Exception as error:
            # `except Exception` keeps the best-effort fallback but lets
            # KeyboardInterrupt through, and the reason is reported.
            print("Loading architecture & weights separately")
            print(f"  reason: {error!r}")
            model = load_keras_model(
                str(TEMP_DIR / model_name / "architecture"),
                str(TEMP_DIR / model_name / "weights"),
                custom_objects={"KerasLayer": hub.KerasLayer},
            )
        reports[model_name] = classification_report(
            [
                label.numpy()
                for _, label in data_container.validation.take(-1).unbatch()
            ],
            model.predict(data_container.validation).argmax(axis=1),
        )
        del model
    return reports

In [None]:
# TF-Hub text-embedding modules to compare.
model_urls = (
    "https://tfhub.dev/google/Wiki-words-250/2",
    "https://tfhub.dev/google/Wiki-words-250-with-normalization/2",
    "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2",
)

In [None]:
# Train and save one classifier per module (long-running).
run_models(model_urls, data_container, class_weights)

In [None]:
# Reload the saved models and compute their validation reports.
reports = evaluate_models(model_urls, data_container)

In [None]:
# Print each report under a banner with the model name.
for model_name, report in reports.items():
    print(f" {model_name} ".center(80, "="))
    print(report)

In [None]:
# ================================ Wiki-words-250 ================================
#               precision    recall  f1-score   support

#            0       0.54      0.81      0.65        77
#            1       0.52      0.68      0.59       159
#            2       0.54      0.60      0.57       303
#            3       0.48      0.54      0.51       500
#            4       0.86      0.73      0.79      1226

#     accuracy                           0.67      2265
#    macro avg       0.59      0.67      0.62      2265
# weighted avg       0.70      0.67      0.68      2265

# ====================== Wiki-words-250-with-normalization =======================
#               precision    recall  f1-score   support

#            0       0.63      0.83      0.72        77
#            1       0.55      0.69      0.61       159
#            2       0.53      0.61      0.57       303
#            3       0.50      0.57      0.53       500
#            4       0.87      0.75      0.81      1226

#     accuracy                           0.69      2265
#    macro avg       0.62      0.69      0.65      2265
# weighted avg       0.71      0.69      0.70      2265

# ====================== nnlm-en-dim128-with-normalization =======================
#               precision    recall  f1-score   support

#            0       0.61      0.86      0.71        77
#            1       0.56      0.69      0.62       159
#            2       0.54      0.59      0.56       303
#            3       0.49      0.56      0.52       500
#            4       0.87      0.74      0.80      1226

#     accuracy                           0.68      2265
#    macro avg       0.61      0.69      0.64      2265
# weighted avg       0.71      0.68      0.69      2265

In [None]:
# ================================ Wiki-words-250 ================================
#               precision    recall  f1-score   support

#            0       0.54      0.81      0.65        77
#            1       0.52      0.68      0.59       159
#            2       0.54      0.60      0.57       303
#            3       0.48      0.54      0.51       500
#            4       0.86      0.73      0.79      1226

#     accuracy                           0.67      2265
#    macro avg       0.59      0.67      0.62      2265
# weighted avg       0.70      0.67      0.68      2265

# ====================== Wiki-words-250-with-normalization =======================
#               precision    recall  f1-score   support

#            0       0.63      0.83      0.72        77
#            1       0.55      0.69      0.61       159
#            2       0.53      0.61      0.57       303
#            3       0.50      0.57      0.53       500
#            4       0.87      0.75      0.81      1226

#     accuracy                           0.69      2265
#    macro avg       0.62      0.69      0.65      2265
# weighted avg       0.71      0.69      0.70      2265

# ====================== nnlm-en-dim128-with-normalization =======================
#               precision    recall  f1-score   support

#            0       0.61      0.86      0.71        77
#            1       0.56      0.69      0.62       159
#            2       0.54      0.59      0.56       303
#            3       0.49      0.56      0.52       500
#            4       0.87      0.74      0.80      1226

#     accuracy                           0.68      2265
#    macro avg       0.61      0.69      0.64      2265
# weighted avg       0.71      0.68      0.69      2265