In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

import dataset
# Abort early, with an actionable message, when TensorFlow reports no GPU device.
assert tf.config.list_physical_devices('GPU'), (
    'TensorFlow does not see any GPU device; check the driver/CUDA setup before running this notebook'
)

Using TensorFlow backend.


In [228]:
# Mini-batch size and fixed sequence length used by model construction,
# training and data loading throughout the notebook.
BATCH_SIZE, MAXLEN = 32, 64

# Number of entries in each lookup table exported by `dataset`: the input
# alphabet (embedding vocabulary) and the three output heads' class counts.
LETTERS_SIZE, NIQQUD_SIZE, DAGESH_SIZE, SIN_SIZE = map(
    len,
    (dataset.letters_table, dataset.niqqud_table, dataset.dagesh_table, dataset.sin_table),
)

def build_model(EMBED_DIM=110, UNITS=220):
    """Build the three-headed bidirectional-LSTM tagging model.

    Architecture: letter embedding (index 0 is treated as padding via
    `mask_zero=True`) -> bidirectional LSTM -> a second bidirectional LSTM
    whose output is added to its own input (residual connection) -> three
    per-position Dense+Softmax heads named 'N', 'D' and 'S'.

    Args:
        EMBED_DIM: size of the letter embedding vectors.
        UNITS: number of LSTM units per direction; the two directions are
            summed, so every recurrent layer outputs `UNITS` features.

    Returns:
        An uncompiled `tf.keras.Model` taking a `(batch, MAXLEN)` input and
        producing the outputs `[N, D, S]`.
    """
    # Only the sequence length is fixed; the batch dimension stays dynamic.
    # (A `batch_size` passed together with `batch_shape` has no effect on the
    # resulting `(None, MAXLEN)` input, so it is not passed here.)
    input_text = tf.keras.Input(shape=(MAXLEN,))

    layer = layers.Embedding(LETTERS_SIZE, EMBED_DIM, mask_zero=True)(input_text)
    layer = layers.Bidirectional(layers.LSTM(UNITS, return_sequences=True, dropout=0.0), merge_mode='sum')(layer)
    # Residual block: merge_mode='sum' keeps the feature size at UNITS so the
    # second recurrent layer's output can be added to its input.
    layer = layers.add([layer,
                         layers.Bidirectional(layers.LSTM(UNITS, return_sequences=True, dropout=0.0), merge_mode='sum')(layer)])

    # One softmax classifier per output head, all sharing the same features.
    outputs = [
        layers.Softmax(name=name)(layers.Dense(size)(layer))
        for name, size in (('N', NIQQUD_SIZE), ('D', DAGESH_SIZE), ('S', SIN_SIZE))
    ]
    model = tf.keras.Model(inputs=[input_text], outputs=outputs)

    # tf.keras.utils.plot_model(model, to_file='model.png')
    return model

# Instantiate the network with the default EMBED_DIM/UNITS. `fit`,
# `print_predictions` and the checkpoint cells all operate on this global.
model = build_model()

def fit(data, min_lr_1, max_lr, min_lr_2):
    """Compile the global `model` and train it for one epoch on `data`.

    The learning rate is updated after every training batch by a triangular
    schedule: it climbs linearly from `min_lr_1` to `max_lr` over the first
    half of the epoch, then descends linearly from `max_lr` towards
    `min_lr_2` over the second half.

    Args:
        data: object exposing `normalized_texts` / `normalized_validation`
            (model inputs) and the `niqqud_*`, `dagesh_*` and `sin_*` target
            arrays for the training and validation splits.
        min_lr_1: learning rate at the start of the epoch (also the
            optimizer's initial rate).
        max_lr: peak learning rate, reached at the middle of the epoch.
        min_lr_2: learning rate the second half of the epoch decays towards.

    Returns:
        Whatever `model.fit` returns (the training history object).
    """
    adam = tf.keras.optimizers.Adam(learning_rate=min_lr_1)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    # Half the number of batches in the epoch: the point where the schedule
    # peaks. Computed once here because it does not change between batches.
    samples = data.normalized_texts.shape[0]
    mid = samples / BATCH_SIZE / 2

    class BatchLearningRateScheduler(tf.keras.callbacks.Callback):
        """Sets the optimizer's learning rate at the end of every batch."""

        def on_batch_end(self, batch, logs=None):
            if batch < mid:
                # Warm-up: start from min_lr_1 (not max_lr) so the rate rises
                # continuously from the optimizer's initial value to the peak.
                lr = min_lr_1 + batch * (max_lr - min_lr_1) / mid
            else:
                # Cool-down: descend from the peak towards min_lr_2.
                lr = max_lr - (batch - mid) * (max_lr - min_lr_2) / mid
            tf.keras.backend.set_value(self.model.optimizer.lr, lr)

    x = data.normalized_texts
    vx = data.normalized_validation
    # NOTE(review): 'C' has no matching output in `build_model` as defined in
    # this notebook; kept unchanged -- confirm whether it can be dropped.
    y = {'N': data.niqqud_texts, 'D': data.dagesh_texts, 'S': data.sin_texts, 'C': data.normalized_texts}
    vy = {'N': data.niqqud_validation, 'D': data.dagesh_validation, 'S': data.sin_validation, 'C': data.normalized_validation}
    return model.fit(
        x, y,
        validation_data=(vx, vy),
        batch_size=BATCH_SIZE,
        epochs=1,
        callbacks=[BatchLearningRateScheduler()],
    )

# Print the layer table, then snapshot the freshly initialised weights so the
# first training cell can restart from the same untrained state.
model.summary()
model.save_weights('./checkpoints/uninit')

Model: "model_31"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_32 (InputLayer)           [(None, 64)]         0                                            
__________________________________________________________________________________________________
embedding_31 (Embedding)        (None, 64, 110)      4840        input_32[0][0]                   
__________________________________________________________________________________________________
bidirectional_62 (Bidirectional (None, 64, 220)      582560      embedding_31[0][0]               
__________________________________________________________________________________________________
bidirectional_63 (Bidirectional (None, 64, 220)      776160      bidirectional_62[0][0]           
___________________________________________________________________________________________

In [3]:
def load_data(source, maxlen=MAXLEN, validation=0.1):
    """Load the named corpus entries from the `texts` directory.

    Args:
        source: iterable of file or directory names relative to `texts`.
        maxlen: forwarded to `dataset.load_file` as `maxlen`.
        validation: forwarded to `dataset.load_file` as its third positional
            argument (presumably the validation fraction -- confirm in `dataset`).

    Returns:
        Whatever `dataset.load_file` returns for the shuffled corpus.
    """
    paths = []
    for entry in source:
        paths.append(os.path.join('texts', entry))
    return dataset.load_file(paths, BATCH_SIZE, validation, maxlen=maxlen, shuffle=True)

In [147]:
# Corpus used by the first training stage; loaded with load_data's defaults
# (maxlen=MAXLEN, validation=0.1).
data_rabanit = load_data(['birkat_hamazon.txt', 'kuzari.txt', 'hakdama_leorot.txt', 'hartzaat_harav.txt', 'orhot_hayim.txt', 'rambam_mamre.txt', 'short_table.txt', 'tomer_dvora.txt'])

In [148]:
# Corpus used by the second training stage; entries without an extension are
# presumably directories under `texts/` -- confirm against `dataset.load_file`.
data_pre_modern = load_data(['elef_layla.txt', 'bialik', 'shaul_tchernichovsky', 'breslev.txt', 'itzhak_berkman', 'zevi_scharfstein', 'pesah_kaplan', 'abraham_regelson',
                             'elisha_porat', 'uriel_ofek', 'yisrael_dushman', 'zvi_zviri', 'atar_hashabat.txt', 'ali_baba.txt'])

In [4]:
# Corpus used by the final training stage and by print_predictions; loaded
# with validation=0.2 instead of load_data's default of 0.1.
data_modern = load_data(validation=0.2, source=['forums', 'newspapers', 'wiki', 'blogs', 'adamtsair.txt', 'katarsis.txt'])  # , 'imagination.txt', 'sipurim.txt', 'ricky.txt'

In [259]:
# Stage 1: reset to the untrained snapshot, train one epoch on data_rabanit
# (min_lr_1=20e-4, max_lr=50e-4, min_lr_2=5e-4), then checkpoint the result.
model.load_weights('./checkpoints/uninit')  # 2e-3, 4e-3, 0.05e-3 0.9702
history = fit(data_rabanit, 20e-4, 50e-4, 5e-4)
model.save_weights('./checkpoints/rabanit')

Train on 76377 samples, validate on 8487 samples


In [268]:
# Stage 2: continue from the stage-1 checkpoint, train one epoch on
# data_pre_modern, then checkpoint the result.
model.load_weights('./checkpoints/rabanit')
history = fit(data_pre_modern, 20e-4, 40e-4, 0.1e-4)
model.save_weights('./checkpoints/pre_modern')

Train on 84096 samples, validate on 9344 samples


In [274]:
# Stage 3: continue from the stage-2 checkpoint and run two one-epoch passes
# over data_modern, the second with lower learning rates. `history` keeps only
# the second pass, and no checkpoint is written by this cell.
model.load_weights('./checkpoints/pre_modern')
history = fit(data_modern, 2e-3, 3e-3, 1e-3)
history = fit(data_modern, 8e-4, 1e-3, 2e-4)

Train on 11417 samples, validate on 2855 samples
Train on 11417 samples, validate on 2855 samples


In [None]:
import matplotlib.pyplot as plt

# 2x2 grid of curves from the most recent `history`: one row per metric
# (accuracy, loss) and one column per output head (D, N).
fig, ax = plt.subplots(nrows=2, ncols=2)

for n, v in enumerate(['accuracy', 'loss']):
    for n1, t in enumerate(['D', 'N']):
        p = ax[n, n1]
        train_key = '{}_{}'.format(t, v)
        # Training curve first, validation curve second, matching the legend order.
        p.plot(history.history[train_key])
        p.plot(history.history['val_' + train_key])
        p.legend([t + '_Train', t + '_Test'], loc='center right')

plt.tight_layout()

In [276]:
import tensorflowjs as tfjs
# Export the current in-memory model in TensorFlow.js format into the working directory.
tfjs.converters.save_keras_model(model, '.')

In [275]:
def print_predictions(data, k):
    """Print predicted vs. reference text for the k-th validation batch.

    Runs the global `model` on validation rows
    `[k * BATCH_SIZE, (k + 1) * BATCH_SIZE)`, decodes the three outputs with
    `dataset.from_categorical`, and uses `data.merge` to render both the
    prediction and the reference labels. Each pair is printed as a
    predicted line followed by a reference line and a blank line.

    Args:
        data: loaded corpus exposing `normalized_validation`, the
            `niqqud_validation` / `dagesh_validation` / `sin_validation`
            label arrays and a `merge` method.
        k: zero-based index of the validation batch to display.
    """
    window = slice(k * BATCH_SIZE, (k + 1) * BATCH_SIZE)
    batch = data.normalized_validation[window]

    # Model outputs are ordered N, D, S.
    prediction = model.predict(batch)
    actual_niqqud = dataset.from_categorical(prediction[0])
    actual_dagesh = dataset.from_categorical(prediction[1])
    actual_sin = dataset.from_categorical(prediction[2])

    expected_niqqud = data.niqqud_validation[window]
    expected_dagesh = data.dagesh_validation[window]
    expected_sin = data.sin_validation[window]

    actual = data.merge(batch, ns=actual_niqqud, ds=actual_dagesh, ss=actual_sin)
    expected = data.merge(batch, ns=expected_niqqud, ds=expected_dagesh, ss=expected_sin)

    for predicted_line, reference_line in zip(actual, expected):
        print('מצוי: ', predicted_line)
        print('רצוי: ', reference_line)
        print()

# Show predictions against references for the first validation batch of data_modern.
print_predictions(data_modern, 0)

מצוי:  הַסַּפְסָל בַּהַפְסָקוֹת, לְהִתְנַדֵּב וְלִהְיוֹת הַיַּלְדָּה שֶׁעוֹמֶדֶת וּמְגַלְגֶּלֶת בְּחֶבֶל וּבִכְלָלִי
רצוי:  הַסַּפְסָל בַּהַפְסָקוֹת, לְהִתְנַדֵּב וְלִהְיוֹת הַיַּלְדָּה שֶׁעוֹמֶדֶת וּמְגַלְגֶּלֶת בְּחֶבֶל וּבִכְלָלֵי

מצוי:  הָרְצִיפוּת הַתִּפְקוּדִית הֵן הָרָשׁוּיוֹת הַמְּקוֹמִיּוֹת, שֶׁיִּצְטָרְכוּ לְהַמְשִׁיךְ לְתַפְקֵד גַּם
רצוי:  הָרְצִיפוּת הַתִּפְקוּדִית הֵן הָרָשׁוּיוֹת הַמְּקוֹמִיּוֹת, שֶׁיִּצְטָרְכוּ לְהַמְשִׁיךְ לְתַפְקֵד גַּם

מצוי:  שֶׁנִּדְבְּקוּ אֵלָיו בִּזְמַנִּים מוּקְדָּמִים יוֹתֵר. לְרֶגֶל ט"וּ בְּאָב, שֶׁלְּמַעֲשֶׂה מִסְמֵל
רצוי:  שֶׁנִּדְבְּקוּ אֵלָיו בִּזְמַנִּים מוּקְדָּמִים יוֹתֵר. לְרֶגֶל ט"ו בְּאָב, שֶׁלְּמַעֲשֶׂה מְסַמֵּל

מצוי:  עִירָאק, בִּירַת הָאוֹטוֹנוֹמְיָה הַכּוֹרְדִּית. פַּרְשָׁנִים בָּעוֹלָם הָעַרְבִי אָמְנָם דִּיבְּרוּ
רצוי:  עִירָאק, בִּירַת הָאוֹטוֹנוֹמְיָה הַכּוּרְדִּית. פַּרְשָׁנִים בָּעוֹלָם הָעַרְבִי אָמְנָם דִּיבְּרוּ

מצוי:  הַבִּיוֹלוֹגִית יָצְרָה מוֹחַ אֱנוֹשִׁי מוּרְכָּב וּמְשׁוּכְלָל, אַךְ חָשׂוּף יוֹתֵר לִפְגָמִים
רצוי: