In [1]:

import tensorflow_text as _

import tensorflow as tf
import tensorflow_hub as hub
from official.nlp import optimization  # to create AdamW optimizer
from sklearn.preprocessing import OneHotEncoder

import json
import numpy as np
import pickle 

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Iterating over the device list (rather than indexing gpu[0]) keeps this cell
# from raising IndexError on a CPU-only machine and applies the same setting to
# every GPU on multi-GPU hosts.
gpu = tf.config.list_physical_devices('GPU')
for device in gpu:
    tf.config.experimental.set_memory_growth(device, True)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read the JSON export; the context manager closes the file handle as soon as
# parsing finishes (the bare open() call previously left it open).
with open('assets/export_large.json', encoding='utf-8') as export_file:
    export = json.load(export_file)

# One (author name, message content) pair per entry in export['messages'].
ds = [(x['author']['name'], x['content']) for x in export['messages']]

# TF Hub handles for the BERT encoder and its matching text preprocessing model.
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"


In [43]:
def build_classifier_model(num_classes: int = 6):
    """Build a Keras text classifier on top of a fine-tunable BERT encoder.

    The model takes raw strings, runs them through the TF Hub preprocessing
    layer and the BERT encoder, and feeds the encoder's ``pooled_output``
    through dropout into a softmax classification head.

    Args:
        num_classes: Number of units in the softmax output layer. Defaults to
            6, the value that was previously hard-coded.

    Returns:
        A ``tf.keras.Model`` mapping a batch of strings to per-class
        probabilities with ``num_classes`` entries per example.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    # trainable=True so the encoder weights are updated during training.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='bert_encoder')
    outputs = encoder(encoder_inputs)

    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(num_classes, activation='softmax', name='multi_classifier')(net)
    return tf.keras.Model(text_input, net)


In [44]:
# Instantiate the classifier defined by build_classifier_model().
model = build_classifier_model()

In [5]:
# One-hot encode the author names (first element of each ds pair).
encoder = OneHotEncoder(handle_unknown='ignore')

# Keep the dense one-hot matrix under its own name so `labels` is not reused
# for two different kinds of object within the same cell.
labels_onehot = encoder.fit_transform([[author] for author, _text in ds]).toarray()
messages = np.array([text for _author, text in ds])

ds_size = len(labels_onehot)

features = tf.data.Dataset.from_tensor_slices(messages)
labels = tf.data.Dataset.from_tensor_slices(labels_onehot)

# cache() comes before shuffle() so the cached elements are re-shuffled on every
# epoch; previously the batches were served in export order each epoch.
# A buffer as large as the dataset gives a full shuffle.
train_ds = (
    tf.data.Dataset.zip((features, labels))
    .cache()
    .shuffle(buffer_size=ds_size)
    .batch(32)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)


In [46]:
# --- Training schedule -------------------------------------------------------
epochs = 10
init_lr = 3e-5

# Number of batches per epoch, read from the dataset's cardinality.
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
# Warm-up step count is 10% of the total number of training steps.
num_warmup_steps = int(0.1 * num_train_steps)

# --- Objective and metric ----------------------------------------------------
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = tf.metrics.CategoricalAccuracy()

# --- Optimizer ---------------------------------------------------------------
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)


In [47]:
# Train on train_ds for `epochs` epochs; the object returned by fit() is kept in `history`.
history = model.fit(x=train_ds, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [95]:
def make_prediction(t: str):
    """Classify a single string with the trained model.

    Args:
        t: The text to classify.

    Returns:
        A ``(target, confidence)`` tuple. ``target`` is whatever
        ``encoder.inverse_transform`` returns for the one-hot row of the
        winning class (a nested, single-row result), and ``confidence`` is the
        model's predicted probability for that class.
    """
    pred = model.predict([t])
    # Only one string is submitted, so the first row holds all class scores.
    probs = pred[0]
    r = int(np.argmax(probs))
    # Size the one-hot mask from the model output instead of hard-coding 6, so
    # the function keeps working when the number of classes changes.
    mask = np.zeros(shape=(len(probs),))
    mask[r] = 1
    return encoder.inverse_transform([mask]), probs[r]

# Example: classify one sample string and report the winning class together
# with its predicted probability formatted as a percentage.
target, confidence = make_prediction("Spam")
print(f'Predicted {target} with confidence {confidence * 100:.2f}%')

Predicted [['- MrChacocha -']] with confidence 47.20%
