In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow as tf
import tensorflowjs as tfjs

import wandb
from wandb.keras import WandbCallback

import dataset
import schedulers

# Fail fast: stop the notebook here if TensorFlow does not report any physical GPU device.
assert tf.config.list_physical_devices('GPU')

In [3]:
# Sizes of the lookup tables exposed by the `dataset` module.
# LETTERS_SIZE is used as the Embedding input dimension; the other three are the
# widths of the 'N', 'D' and 'S' output heads of the models built below.
LETTERS_SIZE = len(dataset.letters_table)
NIQQUD_SIZE = len(dataset.niqqud_table)
DAGESH_SIZE = len(dataset.dagesh_table)
SIN_SIZE = len(dataset.sin_table)

def build_model(units=500, maxlen=64):
    """Build the three-headed sequence-labelling network (variant with BatchNormalization).

    Args:
        units: width of the embedding, of both LSTM layers and of the hidden Dense layer.
        maxlen: length of each input sequence of letter indices.

    Returns:
        An uncompiled `keras.Model` with one input and three softmax outputs
        named 'N', 'D' and 'S'.
    """
    tokens = keras.Input(shape=(maxlen,), batch_size=None)
    # mask_zero=True: index 0 is treated as a masked value by the layers downstream.
    embedded = layers.Embedding(LETTERS_SIZE, units, mask_zero=True)(tokens)

    # Two stacked bidirectional LSTMs; both directions are summed so the width stays `units`.
    first_rnn = layers.Bidirectional(layers.LSTM(units, return_sequences=True), merge_mode='sum')
    hidden = first_rnn(embedded)

    second_rnn = layers.Bidirectional(layers.LSTM(units, return_sequences=True), merge_mode='sum')
    hidden = layers.add([hidden, second_rnn(hidden)])  # residual connection around the second LSTM

    hidden = layers.BatchNormalization()(hidden)
    projected = layers.Dense(units, activation='relu')(hidden)
    features = layers.add([embedded, projected])  # residual connection from the embedding

    def head(name, size):
        # One classification head: linear projection to `size` classes followed by a named softmax.
        return layers.Softmax(name=name)(layers.Dense(size)(features))

    outputs = [
        head('N', NIQQUD_SIZE),
        head('D', DAGESH_SIZE),
        head('S', SIN_SIZE),
    ]
    return keras.Model(inputs=tokens, outputs=outputs)


In [4]:

# masked version of accuracy and sce
def accuracy(real, pred):
    """Sparse categorical accuracy averaged only over positions whose label is non-zero.

    Args:
        real: integer class labels; positions where the label equals 0 are ignored.
        pred: per-class scores for every position, as accepted by
            `tf.keras.metrics.sparse_categorical_accuracy`.

    Returns:
        A scalar tensor: the fraction of non-zero-label positions predicted correctly.
        Returns 0 (instead of NaN from 0/0) when no position has a non-zero label.
    """
    acc = tf.keras.metrics.sparse_categorical_accuracy(real, pred)

    # 1.0 where the label is non-zero, 0.0 elsewhere.
    mask = tf.cast(tf.math.not_equal(real, 0), dtype=acc.dtype)

    # divide_no_nan yields 0 when the mask is empty, so a fully masked batch
    # cannot inject NaN into the running metric.
    return tf.math.divide_no_nan(tf.reduce_sum(acc * mask), tf.reduce_sum(mask))

def sparse_categorical_crossentropy(y_true, y_pred, sample_weight=None):
    """Sparse categorical cross-entropy averaged only over positions whose label is non-zero.

    Args:
        y_true: integer class labels; positions where the label equals 0 are ignored.
        y_pred: predicted class probabilities for every position.
        sample_weight: accepted for signature compatibility only; it is not used.

    Returns:
        A scalar tensor: the mean loss over non-zero-label positions.
        Returns 0 (instead of NaN from 0/0) when no position has a non-zero label.
    """
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

    # 1.0 where the label is non-zero, 0.0 elsewhere.
    mask = tf.cast(tf.math.not_equal(y_true, 0), dtype=loss.dtype)

    # A NaN loss would propagate into every gradient; divide_no_nan returns 0
    # for a batch in which every position is masked out.
    return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

def get_xy(d):
    """Split a dataset object into model inputs and a dict of per-head targets.

    Args:
        d: an object exposing `normalized`, `niqqud`, `dagesh` and `sin`, or None.

    Returns:
        None when `d` is None; otherwise the tuple `(inputs, targets)` where
        `targets` maps the output names 'N', 'D' and 'S' to their label arrays.
    """
    if d is not None:
        inputs = d.normalized
        targets = dict(N=d.niqqud, D=d.dagesh, S=d.sin)
        return inputs, targets
    return None


In [5]:
# Mini-batch size passed to model.fit / model.evaluate and to the learning-rate scheduler.
BATCH_SIZE = 64
# Sequence length: passed to dataset.load_data as `maxlen` and used as the model input length.
MAXLEN = 64

In [26]:
# Seed NumPy's global RNG before loading the corpora.
# NOTE(review): presumably dataset.load_data shuffles/splits with NumPy — confirm.
np.random.seed(1)

# Training corpora keyed by stage name. Each value is whatever dataset.load_data
# returns; experiment() unpacks it as a (train, validation) pair.
data = {
    'mix': dataset.load_data(
        [
            'hebrew_diacritized_private/poetry',
            'hebrew_diacritized_private/rabanit',
            'hebrew_diacritized_private/pre_modern',
        ],
        validation_rate=0,
        maxlen=MAXLEN,
    ),
    'modern': dataset.load_data(
        ['hebrew_diacritized/modern'],
        validation_rate=0,
        maxlen=MAXLEN,
    ),
}



In [39]:
# Export WANDB_MODE=dryrun into the kernel's environment before wandb.init() is called below.
%env WANDB_MODE dryrun
# Lower-case alias of MAXLEN for code in this notebook that refers to `maxlen`.
maxlen = MAXLEN

def experiment(architecture):
    """Build, train and checkpoint one model architecture through a fixed sequence of stages.

    The model is trained stage by stage as listed in `config['order']`. Each stage
    names a corpus in the notebook-level `data` dict, a list of argument tuples for
    `schedulers.CircularLearningRate` (one `fit` call per tuple), and the name under
    which the model is saved once the stage finishes.

    Relies on the notebook-level names `data`, `BATCH_SIZE`, `MAXLEN`, `accuracy`,
    `sparse_categorical_crossentropy` and `get_xy`.

    Args:
        architecture: callable accepting `units=` and returning an uncompiled Keras
            model whose outputs are named 'N', 'D' and 'S'. Its `__name__` is used
            in the wandb run name.

    Returns:
        The trained Keras model.
    """
    UNITS = 500
    # Seed both NumPy and TensorFlow: weight initialisation draws from
    # TensorFlow's RNG, which np.random.seed alone does not control.
    np.random.seed(2)
    tf.random.set_seed(2)

    model = architecture(units=UNITS)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  metrics={'N': accuracy, 'D': accuracy, 'S': accuracy})

    # Snapshot of the freshly initialised weights, taken before any training.
    model.save_weights('./checkpoints/uninit')

    config = {
        'batch_size': BATCH_SIZE,
        'maxlen': MAXLEN,
        'units': UNITS,
        # NOTE(review): the live model object is sent to wandb as a config value — confirm this is intended.
        'model': model,
        # (corpus key in `data`, [CircularLearningRate argument tuples], save name)
        'order': [
            ('mix',    [(30e-4, 80e-4, 1e-4)], 'mix'),
            ('modern', [(50e-4, 50e-4, 1e-5)], 'modern'),
            ('modern', [(50e-4, 50e-4, 1e-5)], 'modern_over'),
        ],
    }

    run = wandb.init(project="dotter",
                     group="architecture",
                     name=f'model_{architecture.__name__}',
                     tags=['CLR', 'ordered'],
                     config=config)

    with run:
        for kind, clrs, save in config['order']:
            train, validation = data[kind]

            training_data = get_xy(train)
            x, y = training_data
            # May be None: get_xy passes None through when the split has no validation part.
            validation_data = get_xy(validation)

            # One wandb callback per stage, shared by every learning-rate cycle of that stage.
            wandb_callback = WandbCallback(log_batch_frequency=10,
                                           training_data=training_data,
                                           validation_data=validation_data,
                                           log_weights=True)
            for clr in clrs:
                scheduler = schedulers.CircularLearningRate(*clr)
                scheduler.set_dataset(train, BATCH_SIZE)
                # Validation data is handed to the wandb callback only, not to fit().
                model.fit(x, y,
                          batch_size=BATCH_SIZE, verbose=1,
                          callbacks=[wandb_callback, scheduler])

            # Persist the stage result twice: full model in the wandb run directory,
            # weights-only checkpoint under ./checkpoints for later cells.
            model.save(os.path.join(wandb.run.dir, save + ".h5"))
            model.save_weights('./checkpoints/' + save)
    return model

def linear_last_dense_no_batchnorm(units, maxlen=None):
    """Build the three-headed network without BatchNormalization and with a linear hidden Dense.

    Differs from `build_model` in two ways: there is no BatchNormalization layer and
    the Dense layer before the residual sum has no activation.

    Args:
        units: width of the embedding, of both LSTM layers and of the hidden Dense layer.
        maxlen: length of each input sequence. Defaults to the notebook constant
            `MAXLEN`, so the function no longer depends on a separately assigned
            global `maxlen`.

    Returns:
        An uncompiled `keras.Model` with one input and three softmax outputs
        named 'N', 'D' and 'S'.
    """
    if maxlen is None:
        # Resolved at call time so the definition itself does not require MAXLEN to exist yet.
        maxlen = MAXLEN

    inp = keras.Input(shape=(maxlen,), batch_size=None)
    # mask_zero=True: index 0 is treated as a masked value by the layers downstream.
    embed = layers.Embedding(LETTERS_SIZE, units, mask_zero=True)(inp)

    # Two stacked bidirectional LSTMs (directions summed), the second wrapped in a residual connection.
    layer = layers.Bidirectional(layers.LSTM(units, return_sequences=True), merge_mode='sum')(embed)
    layer = layers.add([layer, layers.Bidirectional(layers.LSTM(units, return_sequences=True), merge_mode='sum')(layer)])
    # Linear projection added back onto the embedding (residual connection from the input).
    layer = layers.add([embed, layers.Dense(units)(layer)])

    outputs = [
        layers.Softmax(name='N')(layers.Dense(NIQQUD_SIZE)(layer)),
        layers.Softmax(name='D')(layers.Dense(DAGESH_SIZE)(layer)),
        layers.Softmax(name='S')(layers.Dense(SIN_SIZE)(layer)),
    ]
    return keras.Model(inputs=inp, outputs=outputs)

# Train every architecture in the list. The most recently trained model stays bound to the
# notebook-level name `model`, which the cells below read.
for architecture in [linear_last_dense_no_batchnorm]:
    model = experiment(architecture)

env: WANDB_MODE=dryrun




In [None]:
# Restore the weights saved by experiment() at the end of the 'modern_over' stage.
model.load_weights('./checkpoints/modern_over')

# NOTE(review): compile() without arguments presumably replaces the custom loss/metric
# functions so the exported files do not reference them — confirm.
model.compile()
# Export twice: a Keras HDF5 file and, via tensorflowjs, converted artifacts written to
# the current directory ('.').
model.save('modern.h5')
tfjs.converters.save_keras_model(model, '.')

In [None]:
import matplotlib.pyplot as plt

# `history` is only a local variable inside experiment(), so it is not available here.
# Keras also stores the History of the most recent fit() call on the model itself.
# NOTE(review): this covers only the last fit() call (the final learning-rate cycle).
curves = model.history.history

fig, ax = plt.subplots(nrows=2, ncols=3)

# One row per quantity (accuracy, loss), one column per output head (N, D, S).
for row, quantity in enumerate(['accuracy', 'loss']):
    for col, head in enumerate(['N', 'D', 'S']):
        panel = ax[row][col]
        key = head + '_' + quantity
        panel.plot(curves[key], label=head + '_Train')
        # 'val_*' series exist only when fit() was given validation data; skip them otherwise
        # instead of failing with a KeyError.
        if 'val_' + key in curves:
            panel.plot(curves['val_' + key], label=head + '_Test')
        panel.set_title(key)
        panel.legend(loc='center right')

plt.tight_layout()

In [40]:
# Seed NumPy's global RNG before loading the held-out test corpus.
np.random.seed(3)
# Only the first element of the returned pair is kept.
# NOTE(review): the positional 0 and MAXLEN presumably correspond to the `validation_rate`
# and `maxlen` keywords used by the other load_data calls — confirm against dataset.load_data.
test, _ = dataset.load_data(['test/modernTestCorpus/'], 0, MAXLEN)

In [41]:
# To evaluate a stored checkpoint instead of the in-memory model, rebuild it first:
#   model = linear_last_dense_no_batchnorm(units=500)
#   model.load_weights('./checkpoints/modern_over')

# Inputs and per-head targets of the test corpus.
x, y = get_xy(test)

# Re-attach the masked loss and the masked per-head accuracy before evaluating.
model.compile(
    loss=sparse_categorical_crossentropy,
    metrics={head: accuracy for head in ('N', 'D', 'S')},
)

_ = model.evaluate(x, y, batch_size=BATCH_SIZE)



In [44]:
# Restore the weights saved by experiment() at the end of the 'modern_over' stage.
model.load_weights('./checkpoints/modern_over')

def real_evaluation(data, s=slice(0, None), print_comparison=True):
    """Predict diacritics for a slice of `data` and print letter- and word-level accuracy.

    Uses the notebook-level `model` for prediction and `dataset.from_categorical` /
    `dataset.merge` to turn predictions and reference labels back into text.

    Letter accuracy is measured, separately for the three heads, only at positions
    where the expected label is greater than 0. Word accuracy compares predicted and
    expected words pairwise and only counts words containing more than one distinct
    Hebrew letter.

    Args:
        data: object exposing `normalized`, `niqqud`, `dagesh`, `sin` and `text`.
        s: slice selecting which rows of `data` to evaluate.
        print_comparison: when True, also print every mismatching word pair and,
            per row, the predicted text, the expected text and the row's scores.

    Returns:
        None. Results are printed.
    """
    hebrew_letters = 'אבגדהוזחטיכלמנסעפצקרשתךםןףץ'

    def rate(flags):
        # np.mean([]) emits "Mean of empty slice"; report NaN without the warning instead.
        return np.mean(flags) if len(flags) else float('nan')

    batch = data.normalized[s]
    prediction = model.predict(batch)
    actual_niqqud = dataset.from_categorical(prediction[0])
    actual_dagesh = dataset.from_categorical(prediction[1])
    actual_sin = dataset.from_categorical(prediction[2])
    expected_niqqud, expected_dagesh, expected_sin = data.niqqud[s], data.dagesh[s], data.sin[s]

    actual = dataset.merge(data.text[s], batch, actual_niqqud, actual_dagesh, actual_sin)
    expected = dataset.merge(data.text[s], batch, expected_niqqud, expected_dagesh, expected_sin)

    # (expected, actual) label arrays for each of the three heads.
    heads = [
        (expected_niqqud, actual_niqqud),
        (expected_dagesh, actual_dagesh),
        (expected_sin, actual_sin),
    ]

    total_letters = []
    total_words = []
    for i, (_, a, e) in enumerate(zip(batch, actual, expected)):
        letters = []
        for expected_labels, actual_labels in heads:
            # Compute the mask once per head; score only positions with an expected label > 0.
            relevant = expected_labels[i] > 0
            letters.extend(expected_labels[i][relevant] == actual_labels[i][relevant])
        total_letters.extend(letters)

        words = []
        for aw, ew in zip(a.split(), e.split()):
            # Skip words with at most one distinct Hebrew letter.
            if sum(ch in aw for ch in hebrew_letters) > 1:
                words.append(aw == ew)
                if print_comparison and aw != ew:
                    print(aw, ew)
        total_words.extend(words)

        if print_comparison:
            print('מצוי: ', a)
            print('רצוי: ', e)
            print(f'{rate(letters):.2%} ({len(letters)-np.sum(letters)} out of {len(letters)})')
            print(f'{rate(words):.2%} ({len(words)-np.sum(words)} out of {len(words)})')
            print()
    print(f'letters: {rate(total_letters):.2%}, words: {rate(total_words):.2%}')

# Evaluate the whole test corpus and print every comparison.
# NOTE(review): the trailing comment presumably records the totals from an earlier run.
real_evaluation(test, s=slice(0, None), print_comparison=True)  #  letters: 95.11%, words: 77.43%

בְּתּוֹכְנִית בְּתוֹכְנִית
מצוי:  אֲנָשִׁים שֶׁבָּאוּ לְטִיפּוּל אֶצְלוֹ.[1] בְּ-2013 הִשְׁתַּתְּפוּ בְּתּוֹכְנִית "הָאָח הַגָּדוֹל" 
רצוי:  אֲנָשִׁים שֶׁבָּאוּ לְטִיפּוּל אֶצְלוֹ.[1] בְּ-2013 הִשְׁתַּתְּפוּ בְּתוֹכְנִית "הָאָח הַגָּדוֹל" 
98.72% (1 out of 78)
87.50% (1 out of 8)

הַנְּדְרִי הֶנְדְרִי
מצוי:  שָׁנִים עַל יְדֵי סְטִיבֶן הַנְּדְרִי כְּשֶׁזָּכָה בְּאַלִּיפוּת הָעוֹלָם בְּגִיל 21). לְאַחַר מִכֵּן 
רצוי:  שָׁנִים עַל יְדֵי סְטִיבֶן הֶנְדְרִי כְּשֶׁזָּכָה בְּאַלִּיפוּת הָעוֹלָם בְּגִיל 21). לְאַחַר מִכֵּן 
97.59% (2 out of 83)
90.91% (1 out of 11)

צָדִיִּים צִדִּיִּים
הֲפָקְתָם הֲפָקָתָם
מצוי:  נְזִילִים (חוֹכְכִים-מְקוֹרָבִים) צָדִיִּים הֵם עִיצּוּרִים שֶׁבְּמַהֲלַךְ הֲפָקְתָם זֶרֶם 
רצוי:  נְזִילִים (חוֹכְכִים-מְקוֹרָבִים) צִדִּיִּים הֵם עִיצּוּרִים שֶׁבְּמַהֲלַךְ הֲפָקָתָם זֶרֶם 
95.95% (3 out of 74)
75.00% (2 out of 8)

לִזְכֶּר לְזֵכֶר
מצוי:  אַנְדַּרְטַת הַקִּיבּוּצִים אַנְדַּרְטַת הַקִּיבּוּצִים לִזְכֶּר חַבְרֵי הַקִּיבּוּצִים שֶׁנָּפְלוּ 
רצוי:  אַנְדַּרְטַת הַקִּיבּ

requests_with_retry encountered retryable exception: 500 Server Error: Internal Server Error for url: https://api.wandb.ai/files/elazarg/dotter/ubxusps3/file_stream. args: ('https://api.wandb.ai/files/elazarg/dotter/ubxusps3/file_stream',), kwargs: {'json': {'files': {'output.log': {'offset': 225, 'content': ['2020-07-19T13:50:30.125831 \n', '2020-07-19T13:50:30.156835 בְּאַיָאן בָּאיָאן\n', '2020-07-19T13:50:30.156835 מְבַאְרִין[1] מִבָּאָרִין[1]\n', "2020-07-19T13:50:30.157837 וְאֶג'וֹ. וְאָג'וּ.\n", '2020-07-19T13:50:30.157837 קוֹבְּלָאי קוּבְּלַאי\n', "2020-07-19T13:50:30.168834 מצוי:  בְּאַיָאן מְבַאְרִין[1] וְאֶג'וֹ. בִּסְבִיבוֹת 1271 אִירְגֵּן אוֹתָם קוֹבְּלָאי חָאן \n", "2020-07-19T13:50:30.169836 רצוי:  בָּאיָאן מִבָּאָרִין[1] וְאָג'וּ. בִּסְבִיבוֹת 1271 אִירְגֵּן אוֹתָם קוּבְּלַאי חָאן \n", '2020-07-19T13:50:30.169836 80.95% (12 out of 63)\n', '2020-07-19T13:50:30.169836 50.00% (4 out of 8)\n', '2020-07-19T13:50:30.169836 \n', '2020-07-19T13:50:30.170834 אָבִידָע אֲבִידָע\n',

In [None]:
import hebrew
import dataset

# Bare string expression: as the last line of the cell it only echoes 'len' as the cell output.
'len'