In [1]:
import os
import datetime

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers

import dataset
# Fail fast when TensorFlow reports no GPU: an empty device list is falsy and trips the assert.
assert tf.config.list_physical_devices('GPU')


Using TensorFlow backend.


In [15]:
# Sequence length: sizes the model's input layer and is passed to the data loader as `maxlen`.
MAXLEN = 128
# Batch size: used for training, handed to the data loader, and used to slice validation batches.
BATCH_SIZE = 32

def build_model(embed_dim=512, units=256):
    """Build and compile the two-output sequence model.

    The network embeds integer-encoded input (vocabulary size taken from
    `dataset.letters_table`), applies a bidirectional LSTM, adds a second
    bidirectional LSTM as a residual branch, and ends in two per-position
    softmax heads: 'N' (sized by `dataset.niqqud_table`) and 'D' (sized by
    `dataset.dagesh_table`).

    Args:
        embed_dim: Output dimension of the embedding layer.
        units: Units of each LSTM; forward and backward passes are summed
            (`merge_mode='sum'`), so this is also the width fed to the heads.

    Returns:
        A compiled `tf.keras.Model` with one input of shape (batch, MAXLEN)
        and the outputs `[niqqud, dagesh]`.
    """
    niqqud_size = len(dataset.niqqud_table)
    dagesh_size = len(dataset.dagesh_table)
    letters_size = len(dataset.letters_table)

    # Declare the input through `shape` only, leaving the batch dimension as None.
    # Do not combine `batch_shape` with `batch_size`: the recorded model summary
    # shows the batch dimension stayed None (the batch size was not applied).
    # NOTE(review): newer Keras releases are expected to reject that combination
    # outright - confirm against the installed version.
    common_input = tf.keras.Input(shape=(MAXLEN,))
    # mask_zero=True makes the embedding emit a mask for input id 0 (padding).
    common = layers.Embedding(letters_size, embed_dim, mask_zero=True)(common_input)
    common = layers.Bidirectional(layers.LSTM(units, return_sequences=True, dropout=0.1), merge_mode='sum')(common)

    # Residual connection: the second recurrent layer's output is added to its own input.
    residual = layers.Bidirectional(layers.LSTM(units, return_sequences=True, dropout=0.1), merge_mode='sum')(common)
    common = layers.add([common, residual])

    # The head names 'N' and 'D' become the prefixes of the per-output metric keys.
    niqqud = layers.Softmax(name='N')(layers.Dense(niqqud_size)(common))
    dagesh = layers.Softmax(name='D')(layers.Dense(dagesh_size)(common))

    model = tf.keras.Model(inputs=[common_input], outputs=[niqqud, dagesh])

    # The same loss and metric are applied to both outputs.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Writes the architecture diagram to model.png; this needs pydot and graphviz
    # (the recorded output shows the pydot import failing).
    tf.keras.utils.plot_model(model, to_file='model.png')
    model.summary()
    return model
    
# Single shared model instance; `fit` and `print_predictions` read this module-level name.
model = build_model()

def fit(data, learning_rates):
    """Train the module-level `model` on `data`, one epoch per learning rate.

    Args:
        data: Object exposing `normalized_texts`, `niqqud_texts`, `dagesh_texts`
            and the matching `*_validation` attributes.
        learning_rates: Sequence of learning rates; entry `i` is used for epoch `i`,
            and its length sets the number of epochs.

    Returns:
        Whatever `model.fit` returns (its `.history` is read by the plotting cell).
    """
    def rate_for_epoch(epoch, lr):
        # The optimizer's current rate (`lr`) is ignored; the schedule is fully explicit.
        return learning_rates[epoch]

    train_targets = [data.niqqud_texts, data.dagesh_texts]
    validation_targets = [data.niqqud_validation, data.dagesh_validation]
    training_callbacks = [
        tf.keras.callbacks.LearningRateScheduler(rate_for_epoch, verbose=0),
        # tf.keras.callbacks.ModelCheckpoint(filepath='checkpoints/ckpt_{epoch}', save_weights_only=True),
    ]

    return model.fit(
        data.normalized_texts,
        train_targets,
        batch_size=BATCH_SIZE,
        epochs=len(learning_rates),
        validation_data=(data.normalized_validation, validation_targets),
        callbacks=training_callbacks,
    )

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 128, 512)     22016       input_4[0][0]                    
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) (None, 128, 256)     1574912     embedding_3[0][0]                
__________________________________________________________________________________________________
bidirectional_7 (Bidirectional) (None, 128, 256)     1050624     bidirectional_6[0][0]            
______

In [16]:
# Source names for the first training stage; resolved relative to 'texts' by load_data.
rabanit = [
    'birkat_hamazon.txt',
    'hakdama_leorot.txt',
    'hartzaat_harav.txt',
    'orhot_hayim.txt',
    'rambam_mamre.txt',
    'short_table.txt',
    'tomer_dvora.txt',
    'breslev.txt',
    'elef_layla.txt',
]

# Source names for the second training stage.
# NOTE(review): the first five entries have no '.txt' extension - presumably
# directories handled by dataset.load_file; confirm.
modern = [
    'abraham_regelson',
    'elisha_porat',
    'uriel_ofek',
    'yisrael_dushman',
    'zvi_zviri',
    'crazy_night.txt',
    'sipurim.txt',
    'atar_hashabat.txt',
    'ali_baba.txt',
    'people.txt',
    'ricky.txt',
    'imagination.txt',
    'adamtsair.txt',
    'katarsis.txt',
]

def load_data(source):
    """Load a corpus through `dataset.load_file`.

    Args:
        source: Iterable of names; each is joined under the 'texts' directory.

    Returns:
        Whatever `dataset.load_file` returns for those paths.
    """
    paths = []
    for name in source:
        paths.append(os.path.join('texts', name))
    # The positional 0.1 is presumably the validation fraction (the recorded runs
    # validate on roughly 10% of the samples) - confirm against dataset.load_file.
    return dataset.load_file(paths, BATCH_SIZE, 0.1, maxlen=MAXLEN, shuffle=True)

# Load both corpora up front; each call hands the listed names, joined under 'texts', to dataset.load_file.
data_rabanit = load_data(rabanit)
data_modern = load_data(modern)

In [17]:
# Stage 1: one epoch on the rabanit corpus at learning rate 2e-3, then save the weights.
history = fit(data_rabanit, [2e-3])
model.save_weights('./checkpoint_rabanit')

Train on 45619 samples, validate on 5069 samples


In [18]:
# Stage 2: reload the stage-1 weights and train one more epoch on the modern corpus at 1e-3.
# Note: this rebinds `history`, so later cells see only this run's metrics.
model.load_weights('./checkpoint_rabanit')
history = fit(data_modern, [1e-3])

Train on 11001 samples, validate on 1223 samples


In [None]:
fig, ax = plt.subplots(nrows=2, ncols=2)

# Rows are metrics, columns are the model's named outputs ('D' and 'N').
for row, metric in enumerate(['accuracy', 'loss']):
    for col, output in enumerate(['D', 'N']):
        panel = ax[row][col]
        key = output + '_' + metric
        # Markers keep a single-epoch history visible: a one-point series drawn
        # with the default line style renders nothing.
        panel.plot(history.history[key], marker='o')
        panel.plot(history.history['val_' + key], marker='o')
        # Title and axis labels let each panel be read on its own.
        panel.set_title(key)
        panel.set_xlabel('epoch')
        panel.set_ylabel(metric)
        # The second series is the validation data passed to model.fit.
        panel.legend([output + '_Train', output + '_Test'], loc='center right')

plt.tight_layout()

In [20]:
# Export the in-memory Keras model with the tensorflowjs converter; output goes to the current directory ('.').
# NOTE(review): tensorflowjs is imported here rather than in the first cell - presumably to keep it optional; confirm.
import tensorflowjs as tfjs
tfjs.converters.save_keras_model(model, '.')

In [19]:
def print_predictions(data, k):
    """Print predicted and expected text for validation batch `k`.

    Takes the `k`-th slice of `BATCH_SIZE` validation samples, runs the
    module-level `model` on it, decodes both outputs with
    `dataset.from_categorical`, and merges them back onto the input with
    `data.merge`. Each sample is printed as the prediction (labelled 'מצוי'),
    then the expected text (labelled 'רצוי'), then a blank line.

    Args:
        data: Object exposing `normalized_validation`, `niqqud_validation`,
            `dagesh_validation` and a `merge(batch, ns=..., ds=...)` method.
        k: Zero-based index of the validation batch to show.
    """
    start = k * BATCH_SIZE
    window = slice(start, start + BATCH_SIZE)
    batch = data.normalized_validation[window]

    # The model has two outputs, in the order [niqqud, dagesh].
    outputs = model.predict(batch)
    predicted_niqqud = dataset.from_categorical(outputs[0])
    predicted_dagesh = dataset.from_categorical(outputs[1])

    expected_niqqud = data.niqqud_validation[window]
    expected_dagesh = data.dagesh_validation[window]

    predicted_lines = data.merge(batch, ns=predicted_niqqud, ds=predicted_dagesh)
    expected_lines = data.merge(batch, ns=expected_niqqud, ds=expected_dagesh)

    for predicted, expected in zip(predicted_lines, expected_lines):
        print('מצוי: ', predicted)
        print('רצוי: ', expected)
        print()

# Show predictions for the second validation batch (k=1) of the modern corpus.
print_predictions(data_modern, 1)

מצוי:  בַּעֲלִיצוּת. אוֹתוֹ רֶגַע הִגַעְנוּ אֶל הַסִירוֹת. ד"ר לִיבְסִי נָטַל מִכּוֹש בְּיָדוֹ וְשָבֶר לִרְסִיסִים אַחַת מֵהֶן; אַחַר-כָּךְ עָלֵינוּ עַל הַשְּנִיָּה וְהִתְחַלְנוּ חוֹתְרִים לְעֵבֶר
רצוי:  בַּעֲלִיצוּת. אוֹתוֹ רֶגַע הִגַעְנוּ אֶל הַסִירוֹת. ד"ר לִיבְסִי נָטַל מַכּוֹש בְּיָדוֹ וְשָבַר לִרְסִיסִים אַחַת מֵהֶן; אַחַר-כָּךְ עָלִינוּ עַל הַשְנִיָה וְהִתְחַלְנוּ חוֹתְרִים לְעֵבֶר

מצוי:  לְבֵית הַכְּנֶסֶת לִתְפִלַּת מִנְחָה שֶל יוֹם שִשִּי, לִתְפִלַּת קַבְּלַת שַבָּת, וְלִתְפִלַּת עַרְבִית שֶל שַבָּת. בְּנוֹת שְרוּצוֹת לְהִתְפַּלֵּל, יְכוֹלוֹת לָלֶכֶת לְבֵית הַכְּנֶסֶת לְעֶזְרַת הַנָּשִים,
רצוי:  לְבֵית הַכְּנֶסֶת לִתְפִלַּת מִנְחָה שֶל יוֹם שִשִּי, לִתְפִלַּת קַבָּלַת שַבָּת, וְלִתְפִלַּת עַרְבִית שֶל שַבָּת. בָּנוֹת שֶרוֹצוֹת לְהִתְפַּלֵּל, יְכוֹלוֹת לָלֶכֶת לְבֵּית הַכְּנֶסֶת לְעֶזְרַת הַנָּשִים,

מצוי:  שֶהוּא שוֹתֶה?" הֵרִים טְרִילוֹנֵי אֶת קוֹלוֹ. "לֹא, אֲדוֹנִי," הֵשִיב הַקְבַרְנִיט, "אֶלָּא שֶהוּא מִתְנַהֵג בְּקִרְבָה יְתֵרָה". "וְעַכְשָו, קַפִּיטַן, נִגַּש לְעֶצֶם הָע