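# How to train a REWNet and export it as a compatible RetVec tokenizer model

This notebook trains a REWNet model (BERT, mix, or CNN variant) to reconstruct the characters of randomly generated Unicode strings, then saves the part of the network up to its `tokenizer` layer as a standalone model.

FIXME: add more info.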

In [1]:
 %load_ext autoreload
 %autoreload 2

In [2]:
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from termcolor import cprint
from collections import defaultdict
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

from retvec.rewnet import REWCNN, REWBert, REWTCN, REWMix
from retvec.utils import get_random_unicode
from retvec.utils import tf_cap_memory

In [3]:
# NOTE(review): presumably limits how much GPU memory TensorFlow grabs — confirm in retvec.utils.
tf_cap_memory()

In [4]:
# Shared settings for the data generators and the model.
max_len = 16        # argument handed to get_random_unicode() for every generated string
decoder_size = 256  # depth of the one-hot targets; also passed to the model constructor
batch_size = 128    # number of strings generated per batch

In [5]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that synthesizes training batches on the fly.

    Every batch is a fresh set of strings from `get_random_unicode`, paired
    with the one-hot encoding of each string's code points. Nothing is cached,
    so requesting the same index twice returns different data.

    Args:
        max_len: value passed to `get_random_unicode` for each string.
        batch_size: number of strings per batch.
        decoder_size: depth of the one-hot encoding of each character.
        steps_per_epoch: number of batches reported by `__len__`.
    """

    def __init__(self, max_len, batch_size, decoder_size, steps_per_epoch=100000):
        # Newer Keras versions warn when a Sequence subclass skips the base
        # initializer; the call is a harmless no-op on older versions.
        super().__init__()
        self.max_len = max_len
        self.batch_size = batch_size
        self.decoder_size = decoder_size
        self.steps_per_epoch = steps_per_epoch

    def __len__(self):
        """Return the number of batches per epoch."""
        return self.steps_per_epoch

    def __getitem__(self, index):
        """Generate one batch; `index` is ignored because data is random.

        Returns:
            A tuple `(x, y)` of NumPy arrays: the generated strings and the
            one-hot encoding of their code points.
        """
        x = [get_random_unicode(self.max_len) for _ in range(self.batch_size)]
        # One batched one_hot call instead of one call per string.
        # NOTE(review): assumes all generated strings have the same length,
        # which stacking the per-string tensors already required.
        codepoints = [[ord(c) for c in s] for s in x]
        y = tf.one_hot(codepoints, self.decoder_size)
        return np.array(x), np.array(y)

In [6]:
# Training and validation batches come from two separate, identically configured generators.
train_gen = DataGenerator(max_len=max_len, batch_size=batch_size, decoder_size=decoder_size)
test_gen = DataGenerator(max_len=max_len, batch_size=batch_size, decoder_size=decoder_size)


In [7]:
arch = "bert"  # one of: mix, bert, cnn (REWTCN is imported but not wired up here yet)

# Architecture name -> (model constructor, path the exported tokenizer is saved to).
ARCHITECTURES = {
    "bert": (REWBert, "../tmp/bert_auto_model"),
    "mix": (REWMix, "../tmp/mix_auto_model"),
    "cnn": (REWCNN, "../tmp/cnn_auto_model"),
}

# Fail early with the offending value and the valid choices, rather than a bare
# "not implemented".
if arch not in ARCHITECTURES:
    raise ValueError(
        f"arch {arch!r} is not implemented; choose one of {sorted(ARCHITECTURES)}"
    )

model_cls, save_path = ARCHITECTURES[arch]
model = model_cls(decoder_size=decoder_size)
model.summary()

Output size extended by 1 to inject CLS token
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
token (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
rec_vec (RecVec)                (None, 17, 16)       0           token[0][0]                      
__________________________________________________________________________________________________
encoder_start (Dense)           (None, 17, 16)       272         rec_vec[0][0]                    
__________________________________________________________________________________________________
layer_normalization (LayerNorma (None, 17, 16)       32          encoder_start[0][0]              
________________________________________________

In [8]:
# Training schedule: each epoch draws a fixed number of generated batches,
# followed by a short validation pass.
epochs = 5
steps_per_epochs = 1000
validation_steps = 50

model.compile(optimizer=Adam(0.002), loss='binary_crossentropy')

history = model.fit(
    train_gen,
    validation_data=test_gen,
    epochs=epochs,
    steps_per_epoch=steps_per_epochs,
    validation_steps=validation_steps,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# bert sigmoid
Epoch 1/5
1000/1000 [==============================] - 47s 42ms/step - loss: 0.0060 - val_loss: 2.7909e-04
Epoch 2/5
1000/1000 [==============================] - 41s 40ms/step - loss: 2.5774e-04 - val_loss: 2.3399e-04
Epoch 3/5
1000/1000 [==============================] - 41s 41ms/step - loss: 2.3637e-04 - val_loss: 2.4041e-04
Epoch 4/5
1000/1000 [==============================] - 40s 40ms/step - loss: 2.4011e-04 - val_loss: 2.2581e-04
Epoch 5/5
1000/1000 [==============================] - 40s 40ms/step - loss: 2.2561e-04 - val_loss: 1.8994e-04


# sigmoid mixet
Epoch 1/3
1000/1000 [==============================] - 44s 41ms/step - loss: 0.0058 - val_loss: 2.8256e-04
Epoch 2/3
1000/1000 [==============================] - 40s 40ms/step - loss: 2.5486e-04 - val_loss: 2.3813e-04
Epoch 3/3
1000/1000 [==============================] - 41s 41ms/step - loss: 2.4014e-04 - val_loss: 2.2840e-04


# tanh
Epoch 1/3
1000/1000 [==============================] - 43s 37ms/step - loss: 0.0056 - val_loss: 2.8310e-04
Epoch 2/3
1000/1000 [==============================] - 36s 36ms/step - loss: 2.4773e-04 - val_loss: 2.4905e-04
Epoch 3/3
1000/1000 [==============================] - 38s 38ms/step - loss: 2.3715e-04 - val_loss: 2.3028e-04


In [9]:
"save the encoder part as tokenizer"
# Build a new model that shares the trained model's input and stops at the
# layer named 'tokenizer'.
tokenizer = tf.keras.Model(
    inputs=model.input,
    outputs=model.get_layer('tokenizer').output,
)
tokenizer.compile(optimizer='adam')
tokenizer.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
token (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
rec_vec (RecVec)                (None, 17, 16)       0           token[0][0]                      
__________________________________________________________________________________________________
encoder_start (Dense)           (None, 17, 16)       272         rec_vec[0][0]                    
__________________________________________________________________________________________________
layer_normalization (LayerNorma (None, 17, 16)       32          encoder_start[0][0]              
____________________________________________________________________________________________

In [10]:
# Write the extracted tokenizer model to the path chosen alongside the architecture.
tokenizer.save(save_path)



INFO:tensorflow:Assets written to: ../tmp/bert_auto_model\assets


INFO:tensorflow:Assets written to: ../tmp/bert_auto_model\assets
