In [1]:
%load_ext autoreload

In [375]:
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
assert tf.config.list_physical_devices('GPU')

import collections

import dataset

%autoreload
import transformer

In [376]:
BATCH_SIZE = 32

MAXLEN = 50

LETTERS_SIZE = len(dataset.letters_table)
NIQQUD_SIZE = len(dataset.niqqud_table)
DAGESH_SIZE = len(dataset.dagesh_table)
SIN_SIZE = len(dataset.sin_table)

d_model = 102

model = transformer.Transformer(
    num_layers=2,
    d_model=d_model,
    num_heads=6,
    dff=2048,
    input_vocab_size=LETTERS_SIZE,
    target_vocab_size=NIQQUD_SIZE, 
    maximum_position_encoding_input=MAXLEN,
    maximum_position_encoding_target=MAXLEN,
    rate=0.0
)

learning_rate = transformer.CustomSchedule(d_model, warmup_steps=3000)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss='sparse_categorical_crossentropy',  # transformer.MaskedCategoricalCrossentropy(),  # tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy']  # unmasked, so incorrect
)
# pseudo "build" step, to allow printing a summary:
# model.run_eagerly = True
h = model.pseudo_build(MAXLEN, MAXLEN)
model.summary()
model.save_weights('./checkpoints/uninit')

Model: "transformer_35"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_35 (Encoder)         multiple                  929236    
_________________________________________________________________
decoder_35 (Decoder)         multiple                  1010836   
_________________________________________________________________
dense_1171 (Dense)           multiple                  1648      
_________________________________________________________________
softmax_35 (Softmax)         multiple                  0         
Total params: 1,941,720
Trainable params: 1,941,720
Non-trainable params: 0
_________________________________________________________________


In [161]:
def load_data(source, validation=0.1):
    filenames = [os.path.join('texts', f) for f in source]
    train, valid = dataset.load_data(filenames, validation, maxlen=MAXLEN)
    return train, valid


def fit(data, epochs=1):
    train, valid = data
    total = len(data[0])//BATCH_SIZE
    history = collections.defaultdict(list)
    for epoch in range(epochs):
        model.reset_metrics()
        for i in range(total):
            s = slice(i*BATCH_SIZE, (i+1)*BATCH_SIZE)
            padded = np.hstack([np.ones((BATCH_SIZE, 1)), train.niqqud[s]])
            res = model.train_step(train.normalized[s], padded)
            out = ' - '.join(f"{k}: {v:.4f}" for k, v in res.items() if k != "predictions")
            print(f"{i:4d}/{total:4d} - {out}", end='      \r')
        print()
        for k, v in res.items():
            history[k].append(res[k].numpy())
    return history

In [7]:
data_other = load_data(['biblical', 'garbage'])

In [8]:
data_mix = load_data(['poetry', 'rabanit', 'pre_modern'])

In [9]:
data_modern = load_data(validation=0.1, source=['modern'])

In [165]:
model.load_weights('./checkpoints/uninit')
history = fit(data_other, epochs=2)
model.save_weights('./checkpoints/other')

1988/1989 - loss: 1.1859 - accuracy: 0.6084                                    
1988/1989 - loss: 0.5223 - accuracy: 0.8217                        


In [166]:
model.load_weights('./checkpoints/other')
history = fit(data_mix, epochs=2) # (102, 2048, 6) warmup=270 : 974,746 - 0.8323  (7 - same)
model.save_weights('./checkpoints/mix')

5940/5941 - loss: 0.3100 - accuracy: 0.8926                                                                  
5940/5941 - loss: 0.2086 - accuracy: 0.9292                                                                                                                        


In [168]:
model.load_weights('./checkpoints/mix')
history = fit(data_modern, epochs=2)
# print(true_accuracy(data_modern)
model.save_weights('./checkpoints/modern')

 683/ 684 - loss: 0.2062 - accuracy: 0.9289                  
 683/ 684 - loss: 0.1605 - accuracy: 0.9455            


In [None]:
model.load_weights('./checkpoints/modern')

def print_predictions(data, s):
    batch = data.normalized[s]
    prediction = model.predict(batch)
    [actual_niqqud, actual_dagesh, actual_sin] = [dataset.from_categorical(prediction[0]), dataset.from_categorical(prediction[1]), dataset.from_categorical(prediction[2])]
    [expected_niqqud, expected_dagesh, expected_sin] = [data.niqqud[s], data.dagesh[s], data.sin[s]]
    actual = dataset.merge(data.text[s], ts=batch, ns=actual_niqqud, ds=actual_dagesh, ss=actual_sin)
    expected = dataset.merge(data.text[s], ts=batch, ns=expected_niqqud, ds=expected_dagesh, ss=expected_sin)
    total = []
    for i, (a, e) in enumerate(zip(actual, expected)):
        print('מצוי: ', a)
        print('רצוי: ', e)
        last = expected_niqqud[i].tolist().index(0)
        res = expected_niqqud[i][:last] == actual_niqqud[i][:last]
        total.extend(res)
        print(f'{np.mean(res):.2f} ({last - sum(res)} out of {last})')
        print()
    print(round(np.mean(total), 3))

print_predictions(data_modern[1], slice(0, None))

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=len(history))

for i, v in enumerate(history.values()):
    ax[i].plot(v)

plt.tight_layout()

In [374]:
model.load_weights('./checkpoints/modern')

def predict(x):
    batch_len = x.shape[0]
    y_pred = tf.ones((batch_len, 1), dtype=tf.int32)

    padding_mask = transformer.create_padding_mask(x)
    dec_target_padding_mask = transformer.create_padding_mask(x)
    timesteps = x.shape[-1]
    for i in range(timesteps):
        future = tf.ones((batch_len, timesteps - i - 1), dtype=tf.int32)
        y_augment = tf.concat([y_pred, future], axis=-1)
            
        predictions, _ = model(x, y_augment, False, dec_target_padding_mask, padding_mask)
        predictions = predictions[: ,i:i+1, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        y_pred = tf.concat([y_pred, predicted_id], axis=-1)

    return y_pred[:, 1:].numpy()
d = data_modern[1]
n = slice(0, 2*BATCH_SIZE)
output = predict(d.normalized[n])
print(list(output[0]))
print(list(d.niqqud[0]))
(d.niqqud[n] == output).mean()

[15, 1, 14, 6, 1, 1, 1, 2, 7, 1, 1, 1, 15, 7, 8, 1, 1, 1, 13, 10, 1, 1, 6, 2, 1, 14, 1, 2, 1, 14, 6, 1, 1, 1, 2, 6, 2, 1, 14, 2, 1, 14, 1, 6, 2, 1, 11, 1, 1, 0]
[15, 1, 11, 6, 1, 1, 1, 2, 7, 1, 1, 1, 15, 7, 8, 1, 1, 1, 13, 10, 1, 1, 6, 2, 1, 14, 1, 6, 1, 14, 6, 1, 1, 1, 2, 6, 2, 1, 11, 2, 1, 14, 1, 6, 2, 1, 11, 1, 1, 0]


0.9178125

In [None]:
model.load_weights('./checkpoints/modern')

d = data_modern[1]
n = slice(0, BATCH_SIZE)

y = d.niqqud[n]
x = d.normalized[n]
model.test_step(x, y)

In [356]:

def merge(normalized, prediction):
    sentence = []
    for c, n in zip(normalized, prediction):
        if c == dataset.letters_table.PAD_TOKEN:
            break
        sentence.append(dataset.letters_table.indices_char[c])
        sentence.append(dataset.niqqud_table.indices_char[n])
    return ''.join(sentence)

d = data_modern
text = d[1].normalized[0*BATCH_SIZE:1*BATCH_SIZE]
actual = d[1].niqqud[0*BATCH_SIZE:1*BATCH_SIZE]
padded_actual = np.hstack([np.ones((BATCH_SIZE, 1)), actual])[:, :-1]
print(padded_actual.shape)
print(text.shape)
dec_target_padding_mask = transformer.create_padding_mask(actual)
padding_mask = transformer.create_padding_mask(text)
prediction = model(text, padded_actual, False, dec_target_padding_mask, padding_mask)[0]  # np.argmax(history['predictions'], axis=-1)[0]
prediction = np.argmax(prediction, axis=-1)
n = 3
print(text[n])
print(prediction[n])
print(actual[n])
print(prediction[n] == actual[n])
print(np.mean(prediction == actual))
print(merge(text[n], prediction[n]))
print(merge(text[n], actual[n]))

(32, 50)
(32, 50)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

[17 29 22  5 21 30  5 21 31 34 37 41 26 30  5 21 31 31 42 26 26 30 13  5
 20 20 40 26 33 20  5 21 41 17 21  5 17 26 27  5 29 21 19 20 26 41  5  0
  0  0]
[ 7  1 14  1  7  1  1 15  6  2 10  6  1  1  1 15 15  2  6  6  1  1  1  1
  2  2  6  1  2  1  1  8  2 10  1  1  7  1  2  1  2 15  2  6  1  1  1  0
  0  0]
[ 7  1 14  1  7  1  1 15  6  2 10  6  1  1  1 15 15 10  6  6  1  1  1  1
  8  2  6  1  2  1  1  8  2 10  1  1  7  1  2  1  2 15  2  6  1  1  1  0
  0  0]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True  True  True  True  True
 False  True  True  True  True  True  True  True  True  True  True  True


In [125]:
x = np.random.random((4,3))
x
np.hstack([np.ones((4, 1)), x])

array([[1.        , 0.75247506, 0.97689561, 0.34302931],
       [1.        , 0.83199473, 0.08874784, 0.22675684],
       [1.        , 0.37551285, 0.62234334, 0.75537264],
       [1.        , 0.97912614, 0.84942646, 0.32530607]])