In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

import dataset
# Stop immediately when TensorFlow reports no GPU device (the list is empty, hence falsy).
assert tf.config.list_physical_devices('GPU')

Using TensorFlow backend.


In [47]:
# Batch size handed to model.fit and dataset.load_file; print_predictions also
# uses it as the width of the validation slice it displays.
BATCH_SIZE = 32
# Length of the model's input sequences and the default `maxlen` of load_data.
MAXLEN = 200

def build_model():
    """Build, compile and summarise the sequence-tagging model.

    Architecture, as wired below:
      * an embedding of letter ids (``mask_zero=True``, so id 0 is masked),
      * a bidirectional LSTM whose two directions are summed,
      * a second bidirectional LSTM with the same configuration, whose output
        is added to its own input (a residual connection),
      * three per-timestep dense + softmax heads named 'N', 'D' and 'S', sized
        by ``dataset.niqqud_table``, ``dataset.dagesh_table`` and
        ``dataset.sin_table`` respectively.

    Side effects:
        Asks Keras to render the graph to ``model.png`` and prints
        ``model.summary()``.

    Returns:
        The compiled ``tf.keras.Model`` with one input of shape
        ``(batch, MAXLEN)`` and the three outputs listed above.
    """
    EMBED_DIM = 512
    UNITS = 256

    LETTERS_SIZE = len(dataset.letters_table)
    NIQQUD_SIZE = len(dataset.niqqud_table)
    DAGESH_SIZE = len(dataset.dagesh_table)
    SIN_SIZE = len(dataset.sin_table)

    # Only the per-sample shape is declared; the batch dimension stays None.
    # Passing `batch_shape` together with `batch_size` (as before) is
    # contradictory: the batch size was silently dropped, so the resulting
    # input was already (None, MAXLEN).
    common_input = tf.keras.Input(shape=(MAXLEN,))

    embedded = layers.Embedding(LETTERS_SIZE, EMBED_DIM, mask_zero=True)(common_input)

    # merge_mode='sum' keeps the recurrent output width at UNITS, which is what
    # lets the two recurrent outputs be added element-wise below.
    first = layers.Bidirectional(
        layers.LSTM(UNITS, return_sequences=True, dropout=0.1), merge_mode='sum')(embedded)
    second = layers.Bidirectional(
        layers.LSTM(UNITS, return_sequences=True, dropout=0.1), merge_mode='sum')(first)
    common = layers.add([first, second])

    # The softmax layer names become the model's output names; the target
    # dictionaries passed to model.fit are keyed by these names.
    outputs = [
        layers.Softmax(name='N')(layers.Dense(NIQQUD_SIZE)(common)),
        layers.Softmax(name='D')(layers.Dense(DAGESH_SIZE)(common)),
        layers.Softmax(name='S')(layers.Dense(SIN_SIZE)(common))
    ]
    model = tf.keras.Model(inputs=[common_input], outputs=outputs)

    # The single loss / metric specification is applied to every output head.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    tf.keras.utils.plot_model(model, to_file='model.png')
    model.summary()
    return model
    
# Module-level model instance; fit() and print_predictions() read this global.
model = build_model()
# Snapshot the freshly initialised weights so that a later cell can restore them
# with model.load_weights('./checkpoints/uninit') before training starts.
model.save_weights('./checkpoints/uninit')

def fit(data, learning_rates):
    """Train the global ``model`` on ``data``, one epoch per learning rate.

    Args:
        data: object exposing ``normalized_texts`` / ``normalized_validation``
            (model inputs) and ``niqqud_*``, ``dagesh_*`` and ``sin_*``
            attributes with ``_texts`` / ``_validation`` suffixes (targets).
        learning_rates: sequence of learning rates. Its length is the number
            of epochs, and epoch ``i`` (0-based) runs with ``learning_rates[i]``.

    Returns:
        The value returned by ``model.fit`` (the training history).
    """
    x  = data.normalized_texts
    vx = data.normalized_validation
    # Target dictionaries are keyed by the model's output names. The model has
    # exactly three outputs ('N', 'D', 'S'); an additional 'C' entry used to be
    # passed here, but it matches no output and Keras versions that validate
    # the keys reject it, so it is no longer included.
    y  = {'N': data.niqqud_texts,      'D': data.dagesh_texts,      'S': data.sin_texts}
    vy = {'N': data.niqqud_validation, 'D': data.dagesh_validation, 'S': data.sin_validation}
    return model.fit(x, y, validation_data=(vx, vy), batch_size=BATCH_SIZE, epochs=len(learning_rates),
          callbacks=[
              # Overrides the optimizer's learning rate at the start of every epoch.
              tf.keras.callbacks.LearningRateScheduler(lambda epoch, lr: learning_rates[epoch], verbose=0),
              # Per-epoch checkpointing, currently disabled:
              # tf.keras.callbacks.ModelCheckpoint(filepath='checkpoints/ckpt_{epoch}', save_weights_only=True),
          ]
    )

Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.
Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 200, 512)     22528       input_10[0][0]                   
__________________________________________________________________________________________________
bidirectional_18 (Bidirectional (None, 200, 256)     1574912     embedding_9[0][0]                
__________________________________________________________________________________________________
bidirectional_19 (Bidirectional (None, 200, 256)     1050624     bidirectional_18[0][0]           
______

In [48]:
def load_data(source, maxlen=MAXLEN):
    """Load corpus entries that live under the ``texts`` directory.

    Args:
        source: iterable of entry names, each resolved relative to ``texts``.
        maxlen: forwarded unchanged to ``dataset.load_file`` (defaults to MAXLEN).

    Returns:
        Whatever ``dataset.load_file`` returns for the resolved paths.
    """
    paths = []
    for entry in source:
        paths.append(os.path.join('texts', entry))
    # 0.1 is load_file's third positional argument -- presumably the validation
    # fraction; confirm against dataset.load_file.
    return dataset.load_file(paths, BATCH_SIZE, 0.1, shuffle=True, maxlen=maxlen)

In [53]:
# Corpus for the first training stage (consumed by fit(data_rabanit, ...)).
data_rabanit = load_data(['birkat_hamazon.txt', 'kuzari.txt', 'hakdama_leorot.txt', 'hartzaat_harav.txt', 'orhot_hayim.txt', 'rambam_mamre.txt', 'short_table.txt', 'tomer_dvora.txt'])

In [54]:
# Corpus for the second training stage (consumed by fit(data_pre_modern, ...)).
# Entries without a .txt suffix are presumably directories that
# dataset.load_file expands -- verify against that helper.
data_pre_modern = load_data(['elef_layla.txt', 'bialik', 'shaul_tchernichovsky', 'breslev.txt', 'itzhak_berkman', 'zevi_scharfstein', 'pesah_kaplan', 'abraham_regelson',
                             'elisha_porat', 'uriel_ofek', 'yisrael_dushman', 'zvi_zviri', 'atar_hashabat.txt', 'ali_baba.txt'])

In [49]:
# Corpus for the third and last training stage (consumed by fit(data_modern, ...)).
data_modern = load_data(['papers', 'wiki', 'sipurim.txt', 'ricky.txt', 'imagination.txt', 'adamtsair.txt', 'katarsis.txt'])

In [55]:
# Stage 1: reset to the untrained snapshot, train two epochs (learning rate 3e-3,
# then 3e-4) on data_rabanit, and checkpoint the result for the next stage.
model.load_weights('./checkpoints/uninit')
history = fit(data_rabanit, [3e-3, 3e-4])
model.save_weights('./checkpoints/rabanit')

Train on 23788 samples, validate on 2644 samples
Epoch 1/2
Epoch 2/2


In [56]:
# Stage 2: resume from the stage-1 checkpoint, train two epochs (learning rate
# 3e-3, then 3e-4) on data_pre_modern, and checkpoint the result.
model.load_weights('./checkpoints/rabanit')
history = fit(data_pre_modern, [3e-3, 3e-4])
model.save_weights('./checkpoints/pre_modern')

Train on 26150 samples, validate on 2906 samples
Epoch 1/2
Epoch 2/2


In [57]:
# Stage 3: resume from the stage-2 checkpoint and train two epochs on data_modern.
# No checkpoint is written after this stage. `history` is rebound by every
# training cell, so afterwards it holds only this stage's curves.
model.load_weights('./checkpoints/pre_modern')
history = fit(data_modern, [3e-3, 3e-4])

Train on 1094 samples, validate on 122 samples
Epoch 1/2
Epoch 2/2


In [None]:
import matplotlib.pyplot as plt

# Learning curves of the most recent fit() call: one row per metric, one column
# per output head. Every panel is titled and labelled so that the accuracy and
# loss rows can be told apart when the figure is read on its own.
plot_metrics = ['accuracy', 'loss']
plot_heads = ['D', 'N']

# squeeze=False keeps `axes` two-dimensional even if one of the lists above is
# reduced to a single entry.
fig, axes = plt.subplots(nrows=len(plot_metrics), ncols=len(plot_heads), squeeze=False)

for row, metric in enumerate(plot_metrics):
    for col, head in enumerate(plot_heads):
        panel = axes[row][col]
        key = head + '_' + metric
        # Keras stores validation series under the same key with a 'val_' prefix.
        panel.plot(history.history[key])
        panel.plot(history.history['val_' + key])
        panel.set_title(key)
        panel.set_xlabel('epoch')
        panel.set_ylabel(metric)
        panel.legend([head + '_Train', head + '_Test'], loc='center right')

plt.tight_layout()

In [59]:
import tensorflowjs as tfjs
# Export the in-memory Keras model with the TensorFlow.js converter; the output
# is written into the current working directory ('.').
tfjs.converters.save_keras_model(model, '.')

In [58]:
def print_predictions(data, k):
    """Print predicted and reference text for the k-th validation batch.

    Args:
        data: loaded dataset object providing ``normalized_validation``, the
            ``niqqud_validation`` / ``dagesh_validation`` / ``sin_validation``
            targets and a ``merge`` method that recombines them into text.
        k: zero-based index of the BATCH_SIZE-wide validation slice to show.

    For every sample two lines are printed, the model's output followed by the
    reference, and then an empty line.
    """
    start = k * BATCH_SIZE
    window = slice(start, start + BATCH_SIZE)
    inputs = data.normalized_validation[window]

    # The model yields one array per head, in the order N, D, S.
    heads = model.predict(inputs)
    predicted = [dataset.from_categorical(heads[index]) for index in range(3)]
    reference = [
        data.niqqud_validation[window],
        data.dagesh_validation[window],
        data.sin_validation[window],
    ]

    actual = data.merge(inputs, ns=predicted[0], ds=predicted[1], ss=predicted[2])
    expected = data.merge(inputs, ns=reference[0], ds=reference[1], ss=reference[2])

    for got, wanted in zip(actual, expected):
        print('מצוי: ', got)
        print('רצוי: ', wanted)
        print()

# Show validation batch 1 of a fresh load of 'papers' and 'wiki'.
# NOTE(review): load_data shuffles on every call, so this validation split may
# differ from the one held out while training on data_modern -- confirm against
# dataset.load_file before reading these samples as unseen data.
print_predictions(load_data(['papers', 'wiki'], maxlen=MAXLEN), 1)

מצוי:  תָּמִיד מִפוּזָרִים פְּקָקִים שֶׁל הַבַּקְבּוּקִים בְּכָל מָקוֹם", דָּנִית מְגַלָּה. "אֲנִי חוֹשֶׁבֶת שֶׁכְּשֶׁנִּכְנָסִים לְזוּגִיּוֹת צָרִיךְ לְהִתְרַחֵב וּלְהָבִין שֶׁאֲנַחְנוּ חַיִּים פֹּה בְּיַחַד, וַאֲנַחְנוּ רוֹצִים לַעֲשׂוֹת טוֹב לְקַשֶׁר. חָשׁוּב שֶׁלֹּא יִהְיוּ הִתְחַשְׁבְנוּיוֹת, וּמָה שֶׁצָרִיךְ לַעֲשׂוֹת -
רצוי:  תָּמִיד מְפוּזָּרִים פְּקָקִים שֶׁל הַבַּקְבּוּקִים בְּכָל מָקוֹם", דָּנִית מְגַלָּה. "אֲנִי חוֹשֶׁבֶת שֶׁכְּשֶׁנִּכְנָסִים לַזּוּגִיּוּת צָרִיךְ לְהִתְרַחֵב וּלְהָבִין שֶׁאֲנַחְנוּ חַיִּים פֹּה בְּיַחַד, וַאֲנַחְנוּ רוֹצִים לַעֲשׂוֹת טוֹב לְקַשֵּׁר. חָשׁוּב שֶׁלֹּא יִהְיוּ הִתְחַשְׁבְּנוּיוֹת, וּמָה שֶׁצָּרִיךְ לַעֲשׂוֹת -

מצוי:  טִבְעִיִּים. מַעֲרֶכֶת הַמִּסְפָּרִים הַמִּתְקַבֶּלֶת בְּאוֹפֶן זֶה (מִמֶנָה שֶׁל שְׁנֵי מִסְפָּרִים טִבְעִיִּים) נִקְרֵאת הַיּוֹם הַמִּסְפָּרִים הָרַצְיוֹנָלִיִּים הַחִיּוּבִיִּים. וְאוּלָם הַמָּתֵמָטִיקָאִים הַפִּיתְגוֹרָאִים הֶרָאוּ שֶׁבְּרִיבּוּעַ, שֶׁאוֹרֶךְ צִלְעוּ הוּא 5, לֹא נִיתָּן לְהַצִיג אֶת אוֹרֶךְ הָאֲלַכְסוֹן
רצוי: