In [1]:
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

import dataset
import schedulers

import tensorflow as tf
assert tf.config.list_physical_devices('GPU')

In [9]:
# Mini-batch size handed to `model.fit` and to the learning-rate scheduler.
BATCH_SIZE = 32

# Sizes of the lookup tables exposed by the `dataset` module: the letter table
# sizes the embedding, the other three size the per-character output heads.
LETTERS_SIZE, NIQQUD_SIZE, DAGESH_SIZE, SIN_SIZE = map(
    len,
    (
        dataset.letters_table,
        dataset.niqqud_table,
        dataset.dagesh_table,
        dataset.sin_table,
    ),
)

def build_model(UNITS=380):
    """Build the (uncompiled) per-character tagging model.

    Args:
        UNITS: Width shared by the letter embedding, the LSTM and the hidden
            dense layer. They must match because the layers are combined by
            element-wise addition.

    Returns:
        A `keras.Model` taking a 2-D batch of letter ids and producing three
        per-position softmax outputs named 'N', 'D' and 'S', sized by
        NIQQUD_SIZE, DAGESH_SIZE and SIN_SIZE respectively.
    """
    # Both the batch and the sequence dimension are dynamic. No `batch_size`
    # is passed: together with `batch_shape` it has no effect (the model
    # summary reports a `None` batch dimension), so stating it was misleading.
    inp = keras.Input(batch_shape=(None, None))
    embed = layers.Embedding(LETTERS_SIZE, UNITS, mask_zero=True)(inp)

    # A single Bidirectional LSTM layer object is applied twice, so both
    # passes share the same weights; the second pass is added residually.
    bidi = layers.Bidirectional(layers.LSTM(UNITS, return_sequences=True), merge_mode='sum')
    layer = bidi(embed)
    layer = layer + bidi(layer)
    # Skip connection from the raw embedding around the recurrent stack.
    layer = embed + layers.Dense(UNITS, activation='relu')(layer)

    outputs = [
        layers.Softmax(name='N')(layers.Dense(NIQQUD_SIZE)(layer)),
        layers.Softmax(name='D')(layers.Dense(DAGESH_SIZE)(layer)),
        layers.Softmax(name='S')(layers.Dense(SIN_SIZE)(layer)),
    ]
    model = keras.Model(inputs=inp, outputs=outputs)

    return model

# Notebook-global model instance; `fit` and `print_predictions` operate on it.
model = build_model()

model.summary()
# Snapshot the freshly initialised weights so training can be restarted from
# the same starting point by reloading this checkpoint.
model.save_weights('./checkpoints/uninit')

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 380)    16720       input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, None, 380)    866780      embedding_3[0][0]                
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, None, 380)    2313440     conv1d_1[0][0]                   
                                                                 bidirectional_3[0][0]      

In [4]:
# masked version of accuracy and sce
def accuracy(real, pred):
    """Sparse categorical accuracy averaged over non-padding positions.

    Positions whose label in `real` equals 0 are treated as padding and do not
    contribute to the average.

    Args:
        real: Integer class labels; 0 marks padding.
        pred: Per-class scores with one extra trailing axis relative to `real`.

    Returns:
        Scalar tensor: correct predictions divided by the number of non-padding
        positions, or 0 (instead of NaN) when every position is padding.
    """
    acc = tf.keras.metrics.sparse_categorical_accuracy(real, pred)

    mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), dtype=acc.dtype)
    acc *= mask

    # divide_no_nan guards the 0/0 case of a batch that is entirely padding.
    return tf.math.divide_no_nan(tf.reduce_sum(acc), tf.reduce_sum(mask))

def sparse_categorical_crossentropy(y_true, y_pred, sample_weight=None):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Positions whose label in `y_true` equals 0 are treated as padding and do
    not contribute to the loss.

    Args:
        y_true: Integer class labels; 0 marks padding.
        y_pred: Predicted class probabilities with one extra trailing axis.
        sample_weight: Accepted for signature compatibility; not used.

    Returns:
        Scalar tensor: summed loss divided by the number of non-padding
        positions, or 0 (instead of NaN) when every position is padding, so a
        degenerate batch cannot poison the weights with NaN gradients.
    """
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

    mask = tf.cast(tf.math.logical_not(tf.math.equal(y_true, 0)), dtype=loss.dtype)
    loss *= mask

    # divide_no_nan guards the 0/0 case of a batch that is entirely padding.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

def fit(train_validation, scheduler=None, verbose=1, lr=1e-4, epochs=1):
    """Compile the notebook-global `model` and train it on one dataset pair.

    Args:
        train_validation: `(train, valid)` pair; each element exposes
            `.normalized` (inputs) and `.niqqud`, `.dagesh`, `.sin` (targets).
        scheduler: Optional Keras callback. A `schedulers.CircularLearningRate`
            is additionally told about the training set and batch size.
        verbose: Forwarded to `model.fit`.
        lr: Learning rate given to the Adam optimizer.
        epochs: Number of epochs to train.

    Returns:
        Whatever `model.fit` returns (its `.history` is read by later cells).
    """
    train, valid = train_validation

    # Re-compiling on every call creates a fresh Adam optimizer.
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        metrics=[accuracy],
    )

    if isinstance(scheduler, schedulers.CircularLearningRate):
        scheduler.set_dataset(train, BATCH_SIZE)
    callbacks = [scheduler] if scheduler else []

    def targets_of(split):
        # Keys must match the names of the model's output layers.
        return {'N': split.niqqud, 'D': split.dagesh, 'S': split.sin}

    return model.fit(
        train.normalized,
        targets_of(train),
        validation_data=(valid.normalized, targets_of(valid)),
        batch_size=BATCH_SIZE,
        epochs=epochs,
        verbose=verbose,
        callbacks=callbacks,
    )


# Default `maxlen` forwarded to `dataset.load_data`.
MAXLEN = 64


def load_data(source, maxlen=MAXLEN, validation=0.1):
    """Load a train/validation pair from entries under the `texts` directory.

    Args:
        source: Iterable of names, each joined onto the `texts` directory.
        maxlen: Forwarded to `dataset.load_data` as `maxlen`.
        validation: Forwarded positionally to `dataset.load_data`
            (presumably the validation fraction; confirm in `dataset`).

    Returns:
        The `(train, valid)` pair produced by `dataset.load_data`.
    """
    paths = []
    for name in source:
        paths.append(os.path.join('texts', name))

    training_set, validation_set = dataset.load_data(paths, validation, maxlen=maxlen)
    return training_set, validation_set

In [5]:
# (train, valid) pair built from the three non-modern sources.
data_mix = load_data(source=['poetry', 'rabanit', 'pre_modern'])

In [10]:
# Stage 1: start from the untrained snapshot, train one epoch (fit's default)
# on the mixed corpus, then checkpoint the result as 'mix'.
model.load_weights('./checkpoints/uninit')
mix_scheduler = schedulers.CircularLearningRate(30e-4, 80e-4, 1e-4)
history = fit(data_mix, scheduler=mix_scheduler)
model.save_weights('./checkpoints/mix')



In [6]:
# (train, valid) pair for the 'modern' source, with validation=0.2 instead of the default 0.1.
data_modern = load_data(['modern'], validation=0.2)

In [11]:
# Stage 2: fine-tune the 'mix' checkpoint on the modern corpus.
model.load_weights('./checkpoints/mix')

# First a single epoch (fit's default) with the larger schedule values.
# Recorded result with EMBED_DIM=28, UNITS=253:
#   val_N_accuracy: 0.9575 - val_D_accuracy: 0.9856 - val_S_accuracy: 0.9994
history = fit(
    data_modern,
    scheduler=schedulers.CircularLearningRate(50e-4, 60e-4, 1e-5),
)

# Then eight epochs with smaller values; `history` now refers to this run only.
history = fit(
    data_modern,
    scheduler=schedulers.CircularLearningRate(1e-4, 6e-4, 1e-5),
    epochs=8,
)
model.save_weights('./checkpoints/modern')

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [15]:
# Three further epochs on top of the 'modern' checkpoint, saved under a
# separate name so the 'modern' checkpoint itself is left untouched.
model.load_weights('./checkpoints/modern')
history = fit(
    data_modern,
    epochs=3,
    scheduler=schedulers.CircularLearningRate(1e-4, 6e-4, 1e-5),
)
model.save_weights('./checkpoints/modern_over')

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [23]:
# Restore the 'modern' checkpoint (not 'modern_over') and write the whole
# model to a single file named 'modern.h5'.
model.load_weights('./checkpoints/modern')
model.save('modern.h5')

In [24]:
# Export the current in-memory `model` with the TensorFlow.js converter.
# The output goes to the current working directory ('.').
import tensorflowjs as tfjs
# model.load_weights('./checkpoints/modern')
tfjs.converters.save_keras_model(model, '.')

  return h5py.File(h5file)


In [25]:
# Make sure the predictions printed by this cell come from the 'modern' checkpoint.
model.load_weights('./checkpoints/modern')

def print_predictions(data, s):
    """Print predicted vs. expected text for a slice of `data`, with niqqud accuracy.

    For every selected row the predicted line ('מצוי') and the reference line
    ('רצוי') are printed, followed by the fraction of matching niqqud labels
    and the number of mismatches. A final line prints the accuracy over all
    compared positions.

    Args:
        data: Dataset split exposing `.normalized`, `.niqqud`, `.dagesh`,
            `.sin` and `.text`, all indexable by `s`.
        s: Index or slice selecting the rows to evaluate.
    """
    batch = data.normalized[s]
    prediction = model.predict(batch)
    # The model returns its outputs in the order N, D, S.
    actual_niqqud, actual_dagesh, actual_sin = (
        dataset.from_categorical(prediction[0]),
        dataset.from_categorical(prediction[1]),
        dataset.from_categorical(prediction[2]),
    )
    expected_niqqud, expected_dagesh, expected_sin = data.niqqud[s], data.dagesh[s], data.sin[s]
    actual = dataset.merge(data.text[s], ts=batch, ns=actual_niqqud, ds=actual_dagesh, ss=actual_sin)
    expected = dataset.merge(data.text[s], ts=batch, ns=expected_niqqud, ds=expected_dagesh, ss=expected_sin)
    total = []
    for i, (a, e) in enumerate(zip(actual, expected)):
        print('מצוי: ', a)
        print('רצוי: ', e)
        # Compare only up to the first padding label (0). A row that fills the
        # whole window has no padding, so fall back to its full length instead
        # of letting `list.index` raise ValueError.
        row = expected_niqqud[i].tolist()
        last = row.index(0) if 0 in row else len(row)
        res = expected_niqqud[i][:last] == actual_niqqud[i][:last]
        total.extend(res)
        print(round(np.mean(res), 2), f'({last - sum(res)} out of {last})')
        print()
    print(round(np.mean(total), 3))

# data_modern is the (train, valid) pair from load_data; index 1 is the
# validation split. slice(0, None) selects every row of it.
print_predictions(data_modern[1], slice(0, None))

מצוי:  מִכוָור. לְאַחַר מִכֵּן, מֵת הַמּוֹשֵׁל הָרוֹמָאִי בִּפְּרוֹבִינַצְיָה וּבִמְקוֹמוֹ נִשְׁלַח כְּנְצִיב 
רצוי:  מִכְוּוֹר. לְאַחַר מִכֵּן, מֵת הַמּוֹשֵׁל הָרוֹמָאִי בַּפְּרוֹבִינְצִיָּה וּבִמְקוֹמוֹ נִשְׁלַח כִּנְצִיב 
0.89 (7 out of 63)

מצוי:  יוֹתֵר מֵהַזְּכָרִים. לַכְּרִישׁ חוֹטֶם מְחוֹדָד וְשִׁינַּיִים חַדוֹת וּמְעוּקָּלוֹת הַמַּזְכִּירוֹת 
רצוי:  יוֹתֵר מֵהַזְּכָרִים. לְכָרִישׁ חוֹטֶם מְחוּדָּד וְשִׁינַּיִים חַדּוֹת וּמְעוּקְּלוֹת הַמַּזְכִּירוֹת 
0.94 (4 out of 62)

מצוי:  מֵהַפְּגִיעָה, אֵיךְ מַחְזִירִים מֵהַתּוֹפַת, מְבִיאִים הַכָּרָה, רִיפּוּי וְצֶדֶק. הַמְּדִינָה 
רצוי:  מֵהַפְּגִיעָה, אֵיךְ מַחֲזִירִים מֵהַתּוֹפֶת, מְבִיאִים הַכָּרָה, רִיפּוּי וְצֶדֶק. הַמְּדִינָה 
0.97 (2 out of 61)

מצוי:  טוֹב הִתְיַישֵּׁב אַבָּא לְיַד שׁוּלְחַן עֲבוֹדָה, הוֹצִיא נִיר וְעַט מִן הַמִּגָרָה וְהִתְחִיל 
רצוי:  טוֹב הִתְיַישֵּׁב אַבָּא לְיַד שׁוּלְחָן עֲבוֹדָה, הוֹצִיא נְיָר וְעֵט מִן הַמְּגֵרָה וְהִתְחִיל 
0.9 (6 out of 62)

מצוי:  לְהוֹכִיחַ, עַל-סְמַךְ הַשְּׁאֵלוֹת שֶׁהֶעֱמִיד בִּפְ

In [None]:
# Random search over the three CircularLearningRate arguments: every trial
# restarts from the same checkpoint, draws each argument log-uniformly from its
# own range, trains for one epoch and prints the resulting val_N_accuracy.
# NOTE(review): './checkpoints/pre_modern' is not written by any cell visible
# in this notebook; confirm it exists on disk (or whether 'mix' is meant).
search_ranges = [(1e-5, 1e-2), (1e-4, 1e-1), (1e-5, 1e-2)]

for trial in range(50):
    model.load_weights('./checkpoints/pre_modern')
    p1, p2, p3 = [
        np.exp(np.random.uniform(low=np.log(lower), high=np.log(upper)))
        for lower, upper in search_ranges
    ]
    print(p1, p2, p3, end=', ', sep=', ')
    history = fit(
        data_modern,
        scheduler=schedulers.CircularLearningRate(p1, p2, p3),
        verbose=0,
    )
    print(history.history['val_N_accuracy'][0])

In [None]:
import matplotlib.pyplot as plt

# 2x2 grid of training curves taken from the most recent `history`:
# rows are the metric (accuracy, loss), columns are the output head (D, N).
fig, ax = plt.subplots(nrows=2, ncols=2)

for row, metric in enumerate(['accuracy', 'loss']):
    for col, head in enumerate(['D', 'N']):
        panel = ax[row][col]
        panel.plot(history.history[head + '_' + metric])
        panel.plot(history.history['val_' + head + '_' + metric])
        # Label every panel so the figure can be read on its own.
        panel.set_title(head + ' ' + metric)
        panel.set_xlabel('epoch')
        panel.set_ylabel(metric)
        panel.legend([head + '_Train', head + '_Test'], loc='center right')

plt.tight_layout()

In [6]:
# Quick look at the mapping from final-form letters to their regular forms
# defined in the project's `hebrew` module.
import hebrew
print(hebrew.ENDINGS_TO_REGULAR)

{'ך': 'כ', 'ם': 'מ', 'ן': 'נ', 'ף': 'פ', 'ץ': 'צ'}
