In [1]:
%load_ext autoreload

In [191]:
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
assert tf.config.list_physical_devices('GPU')

import collections

%autoreload
import dataset

%autoreload
import transformer

In [192]:
# Number of sentences fed to model.train_step per call in fit().
BATCH_SIZE = 32

# Sequence length: passed to dataset.load_data as `maxlen` and used for the
# model's positional encodings and the pseudo-build below.
MAXLEN = 50

# Vocabulary sizes, taken from the lookup tables of the local `dataset` module.
# Only LETTERS_SIZE and NIQQUD_SIZE are used by the model built in this cell.
LETTERS_SIZE = len(dataset.letters_table)
NIQQUD_SIZE = len(dataset.niqqud_table)
DAGESH_SIZE = len(dataset.dagesh_table)
SIN_SIZE = len(dataset.sin_table)

d_model = 102

# Letters in, niqqud labels out; dropout-style `rate` is set to 0.0.
model = transformer.Transformer(
    num_layers=1,
    d_model=d_model,
    num_heads=6,
    dff=1024,
    input_vocab_size=LETTERS_SIZE,
    target_vocab_size=NIQQUD_SIZE, 
    maximum_position_encoding_input=MAXLEN,
    maximum_position_encoding_target=MAXLEN,
    rate=0.0
)

# NOTE(review): this schedule is created but never handed to the optimizer;
# Adam below runs with a fixed 1e-4 learning rate. Confirm that is intended.
learning_rate = transformer.CustomSchedule(d_model, warmup_steps=3000)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss='sparse_categorical_crossentropy',  # transformer.MaskedCategoricalCrossentropy(),  # tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy']  # unmasked, so incorrect
)
# pseudo "build" step, to allow printing a summary:
model.run_eagerly = True
h = model.pseudo_build(MAXLEN, MAXLEN)
model.summary()
# Snapshot of the freshly built weights; the first training stage reloads it
# so that stage can be re-run from the same starting point.
model.save_weights('./checkpoints/uninit')

Model: "transformer_53"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_53 (Encoder)         multiple                  96322     
_________________________________________________________________
decoder_53 (Decoder)         multiple                  102454    
_________________________________________________________________
dense_917 (Dense)            multiple                  688       
_________________________________________________________________
softmax_53 (Softmax)         multiple                  0         
Total params: 199,464
Trainable params: 199,464
Non-trainable params: 0
_________________________________________________________________


In [128]:
def print_out(q, k, v, causal):
    """Run scaled dot-product attention on (q, k, v) and print the results.

    The attention weights are printed first, then the attention output.
    ``causal`` is forwarded unchanged and no explicit mask is supplied.
    Returns nothing.
    """
    attended, weights = transformer.scaled_dot_product_attention(
        q, k, v, causal=causal, mask=None
    )
    sections = (
        ('Attention weights are:', weights),
        ('Output is:', attended),
    )
    for heading, tensor in sections:
        print(heading)
        print(tensor)

# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# A 4x4 float matrix used simultaneously as query, key and value below.
temp_k = tf.constant([[0, 0, 3, 2],
                      [1, 0, 0, 1],
                      [0, 1, 0, 3],
                      [5, 3, 2, 1]], dtype=tf.float32)

# Self-attention demo with the causal option switched on.
print_out(temp_k, temp_k, temp_k, causal=True)

Attention weights are:
tf.Tensor(
[[0.         0.         0.         0.        ]
 [1.         0.         0.         0.        ]
 [0.81757444 0.18242551 0.         0.        ]
 [0.5761169  0.21194157 0.21194157 0.        ]], shape=(4, 4), dtype=float32)
Output is:
tf.Tensor(
[[0.         0.         0.         0.        ]
 [0.         0.         3.         2.        ]
 [0.18242551 0.         2.4527233  1.8175744 ]
 [0.21194157 0.21194157 1.7283508  2.        ]], shape=(4, 4), dtype=float32)


In [29]:
def load_data(source, validation=0.1):
    """Load the named corpora from the ``texts`` directory.

    Parameters
    ----------
    source : iterable of str
        Names that are joined onto ``texts`` to form the paths to load.
    validation :
        Forwarded as the second argument of ``dataset.load_data``.

    Returns
    -------
    tuple
        The ``(train, valid)`` pair produced by ``dataset.load_data`` with
        ``maxlen=MAXLEN``.
    """
    paths = []
    for name in source:
        paths.append(os.path.join('texts', name))
    train_part, valid_part = dataset.load_data(paths, validation, maxlen=MAXLEN)
    return train_part, valid_part


def fit(data, epochs=1):
    """Train the global ``model`` on the training split of ``data``.

    The training split is consumed in consecutive slices of ``BATCH_SIZE``
    items; a trailing partial batch is dropped. Each slice is passed to
    ``model.train_step`` and a progress line that overwrites itself is printed.

    Parameters
    ----------
    data : tuple
        A ``(train, valid)`` pair as returned by ``load_data``. Only ``train``
        is used; it must support ``len()`` and expose sliceable ``normalized``
        and ``niqqud`` attributes.
    epochs : int
        Number of passes over the training split.

    Returns
    -------
    collections.defaultdict
        Maps every key of the dict returned by ``model.train_step`` to a list
        holding one entry per epoch: the value from the last batch of that
        epoch, converted with ``.numpy()``. Epochs without any full batch
        contribute nothing (previously this raised ``UnboundLocalError``).
    """
    train, _valid = data
    total = len(train) // BATCH_SIZE
    history = collections.defaultdict(list)
    for _ in range(epochs):
        model.reset_metrics()
        # Start every epoch with an empty result so that a split smaller than
        # one batch leaves the history untouched instead of crashing below.
        res = {}
        for i in range(total):
            s = slice(i*BATCH_SIZE, (i+1)*BATCH_SIZE)
            # NOTE(review): `[0] + x` prepends a 0 only when the rows are Python
            # lists; for NumPy rows it is an element-wise addition of 0. Confirm
            # which one train_step expects.
            res = model.train_step(train.normalized[s], [[0] + x for x in train.niqqud[s]])
            out = ' - '.join(f"{k}: {v:.4f}" for k, v in res.items() if k != "predictions")
            # Trailing spaces + carriage return make the line overwrite itself.
            print(f"{i:4d}/{total:4d} - {out}", end='      \r')
        print()
        for k, v in res.items():
            history[k].append(v.numpy())
    return history

In [30]:
# (train, valid) pair for the 'biblical' and 'garbage' sources under texts/, using the default validation=0.1.
data_other = load_data(['biblical', 'garbage'])

In [31]:
# (train, valid) pair for the 'poetry', 'rabanit' and 'pre_modern' sources under texts/, using the default validation=0.1.
data_mix = load_data(['poetry', 'rabanit', 'pre_modern'])

In [32]:
# (train, valid) pair for the 'modern' source under texts/; validation is spelled out but equals the default.
data_modern = load_data(validation=0.1, source=['modern'])

In [193]:
# Training stage 1: start from the freshly built weights, train one epoch on
# data_other and save the result as the 'other' checkpoint.
model.load_weights('./checkpoints/uninit')
history = fit(data_other, epochs=1)
model.save_weights('./checkpoints/other')

1989/1990 - loss: 1.4968 - accuracy: 0.5331                                          


In [None]:
# Training stage 2: continue from the 'other' checkpoint, train one epoch on
# data_mix and save the result as the 'mix' checkpoint.
# NOTE(review): the trailing note on the fit() line looks like a record of an
# earlier experiment's settings and score — confirm it is still relevant.
model.load_weights('./checkpoints/other')
history = fit(data_mix, epochs=1) # (102, 2048, 6) warmup=270 : 974,746 - 0.8323  (7 - same)
model.save_weights('./checkpoints/mix')

 922/5685 - loss: 0.8498 - accuracy: 0.7036                                          

In [41]:
# Training stage 3: continue from the 'mix' checkpoint, train one epoch on
# data_modern and save the result as the 'modern' checkpoint.
model.load_weights('./checkpoints/mix')
history = fit(data_modern, epochs=1)
# Disabled call; `true_accuracy` is not defined anywhere in the visible cells.
# print(true_accuracy(data_modern)
model.save_weights('./checkpoints/modern')

 679/ 680 - loss: 0.0354 - accuracy: 0.9881                                                


In [None]:
# Restore the weights saved after the 'modern' training stage before inspecting predictions.
model.load_weights('./checkpoints/modern')

def print_predictions(data, s):
    """Print predicted and expected text for ``data[s]`` with niqqud accuracy.

    For every sentence in the slice the merged predicted text and the merged
    expected text are printed, followed by the fraction of matching niqqud
    labels and the number of mismatches. The overall fraction across all
    sentences, rounded to three decimals, is printed last.

    Only positions before the first 0 in a sentence's expected niqqud row are
    scored. A row without any 0 is scored in full (previously this raised
    ``ValueError``).

    Parameters
    ----------
    data :
        Object exposing sliceable ``normalized``, ``niqqud``, ``dagesh``,
        ``sin`` and ``text`` attributes; the label rows must support
        ``.tolist()``.
    s : slice
        Selection of sentences to show.

    Returns
    -------
    None
    """
    batch = data.normalized[s]
    prediction = model.predict(batch)
    # NOTE(review): assumes model.predict yields three outputs ordered
    # (niqqud, dagesh, sin) — confirm against the model actually loaded.
    actual_niqqud, actual_dagesh, actual_sin = [
        dataset.from_categorical(prediction[k]) for k in range(3)
    ]
    expected_niqqud, expected_dagesh, expected_sin = data.niqqud[s], data.dagesh[s], data.sin[s]
    actual = dataset.merge(data.text[s], ts=batch, ns=actual_niqqud, ds=actual_dagesh, ss=actual_sin)
    expected = dataset.merge(data.text[s], ts=batch, ns=expected_niqqud, ds=expected_dagesh, ss=expected_sin)
    total = []
    for i, (a, e) in enumerate(zip(actual, expected)):
        print('מצוי: ', a)
        print('רצוי: ', e)
        row = expected_niqqud[i].tolist()
        # A sentence that fills the whole row has no 0 to find; score all of it.
        last = row.index(0) if 0 in row else len(row)
        res = expected_niqqud[i][:last] == actual_niqqud[i][:last]
        total.extend(res)
        correct = int(np.sum(res))
        # Avoid a mean over nothing when the row starts with 0.
        accuracy = correct / last if last else float('nan')
        print(f'{accuracy:.2f} ({last - correct} out of {last})')
        print()
    overall = np.mean(total) if total else float('nan')
    print(round(overall, 3))

# Show every sentence of the validation split (index 1 of the pair returned by load_data).
print_predictions(data_modern[1], slice(0, None))

In [None]:
import matplotlib.pyplot as plt

# One panel per metric recorded by the most recent fit() call.
# squeeze=False makes subplots always return a 2-D array of Axes, so indexing
# works even when `history` holds a single metric; at least one column is
# requested because subplots rejects ncols=0 when `history` is empty.
fig, ax = plt.subplots(nrows=1, ncols=max(len(history), 1), squeeze=False)
ax = ax.ravel()

for i, (name, v) in enumerate(history.items()):
    ax[i].plot(v)
    # Label each panel so the figure can be read on its own.
    ax[i].set_title(name)

plt.tight_layout()

In [None]:
# Restore the weights saved after the 'modern' training stage.
model.load_weights('./checkpoints/modern')
def evaluate(inp_sentence):
    """Decode one encoded sentence position by position, printing every step.

    For each position ``i`` before the first 0 in ``inp_sentence`` the global
    ``model`` is called on the whole input together with the labels predicted
    so far, and the arg-max prediction at position ``i`` is written into
    ``output[i]``. The masks, raw predictions and running output are printed;
    nothing is returned.

    Parameters
    ----------
    inp_sentence :
        One row of ``normalized`` (it must expose ``.shape`` and ``.tolist()``).
        NOTE(review): presumably padded with 0 up to MAXLEN — confirm.

    Raises
    ------
    ValueError
        If ``inp_sentence`` contains no 0, because ``list.index(0)`` then fails.
    """
    encoder_input = tf.expand_dims(inp_sentence, 0)
    # Decoder input, filled in one position per loop iteration.
    output = [0] * len(inp_sentence)
    # NOTE(review): `size` is never read afterwards.
    size = inp_sentence.shape[0]
    # Index of the first 0, used as the number of positions to decode.
    last = inp_sentence.tolist().index(0)
    for i in range(last):
        # Does not depend on i; recomputed on every iteration.
        padding_mask = transformer.create_padding_mask(encoder_input)
        # i zeros followed by MAXLEN - i ones, shaped (1, 1, 1, MAXLEN).
        # The width is MAXLEN, not len(inp_sentence).
        # NOTE(review): presumably 1 marks positions that must not be attended to — confirm.
        look_ahead_mask = tf.cast(tf.constant([[[[0]*i + [1]*(MAXLEN-i)]]]), dtype=float)
        combined_mask = tf.maximum(padding_mask, look_ahead_mask)
        
        output_tensor = tf.constant([output], dtype=tf.int32)
        print(combined_mask)
        # predictions.shape == (batch_size, seq_len, vocab_size)
        # NOTE(review): six positional arguments here, while another cell calls
        # model(text, actual, False, padding_mask) with four — confirm which
        # signature the current transformer.Transformer accepts.
        predictions, _ = model(encoder_input, output_tensor, False, padding_mask, combined_mask, padding_mask)
        print(predictions)
        # select the last character from the seq_len dimension
        # predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # store the prediction for position i in the list that is fed back to the decoder on the next iteration.
        print(predicted_id)
        output[i] = predicted_id.numpy()[0][i]
        # print(combined_mask[0, 0, i, :].numpy())
        print(output)
        print()
    print(output)
# Decode the first sentence of the training split (index 0 of the pair returned by load_data).
evaluate(data_modern[0].normalized[0])

In [130]:

def merge(normalized, prediction):
    """Interleave letters with their niqqud marks and return the joined text.

    ``normalized`` holds letter indices and ``prediction`` the niqqud index for
    each letter. Pairs are consumed in order and processing stops at the first
    letter equal to the letters table's ``PAD_TOKEN`` (or when the shorter of
    the two sequences ends).
    """
    letters, marks = dataset.letters_table, dataset.niqqud_table

    def pieces():
        for letter_id, mark_id in zip(normalized, prediction):
            if letter_id == letters.PAD_TOKEN:
                return
            yield letters.indices_char[letter_id]
            yield marks.indices_char[mark_id]

    return ''.join(pieces())

# Spot-check one batch: run the model on it and compare with the ground truth.
d = data_modern
# d[1] is the validation split (second element of the pair returned by
# load_data); the slices below pick its third BATCH_SIZE-sized batch.
text = d[1].normalized[2*BATCH_SIZE:3*BATCH_SIZE]
actual = d[1].niqqud[2*BATCH_SIZE:3*BATCH_SIZE]
padding_mask = transformer.create_padding_mask(text)
# NOTE(review): the ground-truth labels `actual` are passed as the model's
# second input, so the reported agreement may not reflect inference without
# access to the labels — confirm how the model uses that argument.
prediction = model(text, actual, False, padding_mask)[0]  # np.argmax(history['predictions'], axis=-1)[0]
prediction = np.argmax(prediction, axis=-1)
# Index of the sentence within the batch to print in detail.
n = 5
print(text[n])
print(prediction[n])
print(actual[n])
print(prediction[n] == actual[n])
# Agreement over every position of the batch; no padding mask is applied here.
print(np.mean(prediction == actual))
print(merge(text[n], prediction[n]))
print(merge(text[n], actual[n]))

[21 39 19 21  5 42 29  5 37 22 33 40 39 26 22 43  5 42 22 33 22 43  5 28
 25 22 41 26 30  5  9 41 17 22  5 25 22 41  5 25 26 26 29 22 41 10 11  5
  0  0]
[ 2  2 10  1  1  8  1  1  1 14  2  2  6  1 11  1  1  1 11  1 11  1  1 15
  1 14  6  1  1  1  1  2  1 14  1  1 14  1  1  8  1  2  1 11  1  1  1  1
  0  0]
[15 10 10  1  1  8  1  1  1 14  2  2  6  1 11  1  1  1 11  1 11  1  1 15
  1 14  6  1  1  1  1  2  1 14  1  1 14  1  1  8  1  2  1 11  1  1  1  1
  0  0]
[False False  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]
0.97125
הְצְגָה שֶל פוּנְקְצִיוֹת שוֹנוֹת כַטוּרִים (רְאוּ טוּר טֶייְלוֹר), 
הַצָגָה שֶל פוּנְקְצִיוֹת שוֹנוֹת כַטוּרִים (רְאוּ טוּר טֶייְלוֹר), 


In [123]:
import numpy as np
# Scratch check of a chained matrix product.
# a: the integers 1..9 laid out row by row in a 3x3 matrix.
a = np.arange(1, 10).reshape(3, 3)
# b: 3x3 upper-triangular matrix of ones.
b = np.triu(np.ones((3, 3), dtype=int))
# c: a single row vector; its transpose is the column multiplied last.
c = np.array([4, 5, 6])[np.newaxis, :]
print(c.T)
a @ b @ c.T


[[4]
 [5]
 [6]]


array([[ 55],
       [151],
       [247]])

In [None]:
def print_out(q, k, v):
    """Run scaled dot-product attention on (q, k, v) and print the results.

    The attention weights are printed first, then the attention output.
    ``None`` is passed as the fourth positional argument. Returns nothing.

    NOTE(review): this redefines the four-argument ``print_out`` (the one that
    takes ``causal``) defined in an earlier cell of this notebook; confirm what
    the fourth positional parameter of ``scaled_dot_product_attention`` is.
    """
    result = transformer.scaled_dot_product_attention(q, k, v, None)
    attended, weights = result
    print('Attention weights are:', weights, 'Output is:', attended, sep='\n')

# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Four keys; the third and fourth are identical.
temp_k = tf.constant([[10,0,0],
                      [0,10,0],
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)  # (4, 3)

# One value row per key.
temp_v = tf.constant([[   1,0],
                      [  10,0],
                      [ 100,5],
                      [1000,6]], dtype=tf.float32)  # (4, 2)

# This `query` aligns with the second `key`,
# so the second `value` is returned.
temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)