In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (e.g. the local `dataset` / `schedulers` modules imported below) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow as tf
import tensorflowjs as tfjs

import wandb
from wandb.keras import WandbCallback

import dataset
import schedulers

# Fail fast, with an explicit message, when TensorFlow cannot see any GPU device.
assert tf.config.list_physical_devices('GPU'), 'No GPU device is visible to TensorFlow'

In [3]:

# masked version of accuracy and sce
def accuracy(real, pred):
    """Sparse categorical accuracy averaged over unmasked positions only.

    Positions where ``real`` equals 0 are masked out: they contribute neither
    to the count of correct predictions nor to the denominator.

    Args:
        real: integer class labels; the value 0 marks a masked position.
        pred: per-class predictions, as accepted by
            ``tf.keras.metrics.sparse_categorical_accuracy``.

    Returns:
        Scalar tensor with the fraction of unmasked positions that were
        predicted correctly, or 0 when every position is masked.
    """
    acc = tf.keras.metrics.sparse_categorical_accuracy(real, pred)

    mask = tf.cast(tf.math.not_equal(real, 0), dtype=acc.dtype)
    acc *= mask

    # A plain division yields NaN (0 / 0) when nothing is unmasked;
    # divide_no_nan returns 0 in that case instead.
    return tf.math.divide_no_nan(tf.reduce_sum(acc), tf.reduce_sum(mask))

def sparse_categorical_crossentropy(y_true, y_pred, sample_weight=None):
    """Sparse categorical cross-entropy averaged over unmasked positions only.

    Positions where ``y_true`` equals 0 are masked out: their loss is zeroed
    and they are excluded from the denominator.

    Args:
        y_true: integer class labels; the value 0 marks a masked position.
        y_pred: per-class predictions, as accepted by
            ``tf.keras.losses.sparse_categorical_crossentropy``.
        sample_weight: accepted for signature compatibility; not used.

    Returns:
        Scalar tensor with the mean loss over unmasked positions, or 0 when
        every position is masked.
    """
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

    mask = tf.cast(tf.math.not_equal(y_true, 0), dtype=loss.dtype)
    loss *= mask

    # A plain division yields NaN (0 / 0) when nothing is unmasked, and a NaN
    # loss would propagate into the gradients; divide_no_nan returns 0 instead.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

def get_xy(d):
    """Split a dataset object into model inputs and a per-output label dict.

    Args:
        d: object exposing ``normalized``, ``niqqud``, ``dagesh`` and ``sin``
            attributes, or ``None``.

    Returns:
        ``None`` when ``d`` is ``None``; otherwise the tuple
        ``(d.normalized, labels)`` where ``labels`` maps the output names
        'N', 'D' and 'S' to ``d.niqqud``, ``d.dagesh`` and ``d.sin``.
    """
    if d is not None:
        inputs = d.normalized
        labels = dict(N=d.niqqud, D=d.dagesh, S=d.sin)
        return inputs, labels
    return None


In [11]:
# Raw corpora, keyed by the name of the training stage that consumes them.
corpus = {
    'mix': dataset.read_corpora([
        'hebrew_diacritized_private/poetry',
        'hebrew_diacritized_private/rabanit',
        'hebrew_diacritized_private/pre_modern',
    ]),
    'modern': dataset.read_corpora([
        'hebrew_diacritized/modern',
    ]),
}


In [5]:
# Sizes of the input-letter table and of each of the three label tables,
# as defined by the project's `dataset` module.
LETTERS_SIZE, NIQQUD_SIZE, DAGESH_SIZE, SIN_SIZE = (
    len(table)
    for table in (dataset.letters_table,
                  dataset.niqqud_table,
                  dataset.dagesh_table,
                  dataset.sin_table)
)

def build_model(units):
    """Build the three-output bidirectional-LSTM model.

    Architecture, in order: an embedding over ``LETTERS_SIZE`` symbols with
    ``mask_zero=True``; a bidirectional LSTM; a second bidirectional LSTM whose
    output is added to the first one's output; a ``Dense(units)`` layer; and
    three Dense + Softmax heads named 'N', 'D' and 'S' with ``NIQQUD_SIZE``,
    ``DAGESH_SIZE`` and ``SIN_SIZE`` classes respectively.

    Args:
        units: width of the embedding, of each LSTM direction and of the
            shared Dense layer.

    Returns:
        An uncompiled ``keras.Model`` with one variable-length integer
        sequence input and the outputs ordered [N, D, S].
    """
    def bi_lstm():
        # Both directions are concatenated, so the output width is 2 * units.
        recurrent = layers.LSTM(units, return_sequences=True, dropout=0.1)
        return layers.Bidirectional(recurrent, merge_mode='concat')

    inp = keras.Input(shape=(None,), batch_size=None)
    embedded = layers.Embedding(LETTERS_SIZE, units, mask_zero=True)(inp)

    first = bi_lstm()(embedded)
    second = bi_lstm()(first)
    # Residual connection around the second recurrent layer.
    combined = layers.add([first, second])
    hidden = layers.Dense(units)(combined)

    def head(name, size):
        scores = layers.Dense(size)(hidden)
        return layers.Softmax(name=name)(scores)

    outputs = [
        head('N', NIQQUD_SIZE),
        head('D', DAGESH_SIZE),
        head('S', SIN_SIZE),
    ]
    return keras.Model(inputs=inp, outputs=outputs)


In [12]:
MAXLEN = 90
np.random.seed(2)

# One (train, validation) pair per corpus, loaded in the order 'mix' then
# 'modern' so the seeded NumPy random state is consumed in a fixed order.
# NOTE(review): validation_rate=0 presumably means nothing is held out for
# validation -- confirm against dataset.load_data.
data = {
    kind: dataset.load_data(corpus[kind], validation_rate=0, maxlen=MAXLEN)
    for kind in ('mix', 'modern')
}


In [13]:
# Set the WANDB_MODE environment variable to "dryrun" for this kernel before
# wandb.init() is called inside experiment().
%env WANDB_MODE dryrun

def experiment():
    """Train the model stage by stage under a wandb run and return it.

    Builds and compiles a fresh model, writes its initial weights to
    './checkpoints/uninit' and then restores './checkpoints/mix'. For every
    ``(kind, clr, save)`` stage it calls ``model.fit`` once on ``data[kind]``
    (no ``epochs`` argument is passed, so the Keras default applies) with a
    ``WandbCallback`` and a ``schedulers.CircularLearningRate(*clr)``
    callback, then saves the full model as ``<wandb run dir>/<save>.h5`` and
    the weights as ``./checkpoints/<save>``.

    Reads the notebook-level names ``data`` and ``MAXLEN``.

    Returns:
        The trained ``keras.Model``.
    """
    BATCH_SIZE = 64
    UNITS = 500
    np.random.seed(2)

    model = build_model(units=UNITS)
    per_output_accuracy = {'N': accuracy, 'D': accuracy, 'S': accuracy}
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  metrics=per_output_accuracy)

    # Keep a copy of the untrained weights, then continue from the weights
    # stored under './checkpoints/mix'.
    model.save_weights('./checkpoints/uninit')
    model.load_weights('./checkpoints/mix')

    # Each stage is (key into `data`, CircularLearningRate arguments, checkpoint name).
    # Disabled stage, kept for reference:
    #     ('mix', (30e-4, 80e-4, 1e-4), 'mix')
    modern_clr = (40e-4, 40e-4, 1e-5)
    stages = [
        ('modern', modern_clr, checkpoint_name)
        for checkpoint_name in ('modern', 'modern_over', 'modern_over1', 'modern_over2')
    ]

    config = {
        'batch_size': BATCH_SIZE,
        'maxlen': MAXLEN,
        'units': UNITS,
        'model': model,
        'order': stages,
    }

    run = wandb.init(project="dotter",
                     group="with_govil",
                     name='fix_vav_noval',
                     tags=['CLR', 'ordered'],
                     config=config)

    with run:
        for kind, clr, save in stages:
            train, validation = data[kind]

            training_data = get_xy(train)
            x, y = training_data
            # get_xy returns None when `validation` is None.
            validation_data = get_xy(validation)

            wandb_callback = WandbCallback(log_batch_frequency=10,  # int(len(train.normalized) / BATCH_SIZE / 100),
                                           training_data=training_data,
                                           validation_data=validation_data,
                                           log_weights=True)
            scheduler = schedulers.CircularLearningRate(*clr)
            scheduler.set_dataset(train, BATCH_SIZE)

            model.fit(x, y,
                      validation_data=validation_data,
                      batch_size=BATCH_SIZE,
                      verbose=1,
                      callbacks=[wandb_callback, scheduler])

            model.save(os.path.join(wandb.run.dir, f'{save}.h5'))
            model.save_weights(f'./checkpoints/{save}')
    return model

# Run all training stages and bind the trained model at notebook scope.
model = experiment()

env: WANDB_MODE=dryrun




In [None]:
# Export cell: rebuild the architecture at the width experiment() trains with
# (UNITS = 500) and restore the weights saved by its last stage ('modern_over2').
model = build_model(units=500)
model.load_weights('./checkpoints/modern_over2')

# compile() is called with no loss, optimizer or metrics before saving.
# NOTE(review): presumably so the exported files do not reference the custom
# loss/metric functions defined in this notebook -- confirm.
model.compile()
# Write a Keras HDF5 file and a TensorFlow.js conversion of the same model.
model.save('models/modern.h5')
tfjs.converters.save_keras_model(model, 'models/')

In [None]:
import matplotlib.pyplot as plt

# Training curves: one row per quantity (accuracy, loss), one column per model
# output (N, D, S).
# NOTE(review): no visible cell binds `history` at notebook scope
# (experiment() keeps the result of model.fit local) -- confirm where it is
# expected to come from before running this cell.
fig, ax = plt.subplots(nrows=2, ncols=3)

for row, quantity in enumerate(['accuracy', 'loss']):
    for col, output in enumerate(['N', 'D', 'S']):
        panel = ax[row][col]
        key = output + '_' + quantity
        # Validation series exist only when fit() was given validation data;
        # the data cells above load with validation_rate=0, so plot only the
        # curves that are actually present instead of raising KeyError.
        for prefix, label in (('', output + '_Train'), ('val_', output + '_Test')):
            series = history.history.get(prefix + key)
            if series is not None:
                panel.plot(series, label=label)
        panel.set_title(key)
        panel.legend(loc='center right')

plt.tight_layout()

In [8]:
# Load the held-out test corpus. NumPy is re-seeded first.
# NOTE(review): presumably because load_data draws on NumPy's random state -- confirm.
np.random.seed(3)
# load_data returns a pair; only the first element is kept as `test`.
# NOTE(review): the positional 0 and MAXLEN presumably correspond to the
# validation_rate and maxlen keywords used in the training-data cell -- confirm
# against dataset.load_data.
test, _ = dataset.load_data(dataset.read_corpora(['test/modernTestCorpus/']), 0, MAXLEN)

In [None]:
# Evaluate the final checkpoint on the test set with the masked loss/metrics.
# The checkpoint './checkpoints/modern_over2' is written by experiment() from
# a model built with UNITS = 500, so the architecture rebuilt here must use
# the same width for the saved weights to fit.
model = build_model(units=500)
model.load_weights('./checkpoints/modern_over2')
# Same (inputs, {'N', 'D', 'S'} labels) layout that training uses.
x, y = get_xy(test)

model.compile(loss=sparse_categorical_crossentropy,
              metrics={'N': accuracy, 'D': accuracy, 'S': accuracy})

# Bind the result to `_` so the cell does not echo the returned values.
_ = model.evaluate(x=x, y=y, batch_size=64)

In [23]:
# Restore the last training stage's weights into whichever `model` is
# currently bound at notebook scope; real_evaluation() below reads that global.
model.load_weights('./checkpoints/modern_over2')

def real_evaluation(data, s=slice(0, None), print_comparison=True):
    """Print letter-, decision- and word-level accuracy of the global ``model``.

    Predicts on ``data.normalized[s]`` and compares against ``data.niqqud``,
    ``data.dagesh`` and ``data.sin`` over the same slice. Three accuracies
    are accumulated over all samples:

    * decisions: for each of the three label sets, every position whose
      expected label is > 0 counts once, and is correct when the predicted
      label equals the expected one.
    * letters: every position where at least one expected label is > 0 counts
      once, and is correct only when all three labels match (predictions at
      positions whose expected label is 0 are zeroed first).
    * words: whitespace-separated tokens of the predicted text containing
      more than one distinct Hebrew letter; correct when the token is equal
      to the expected token at the same index.

    Args:
        data: dataset object exposing ``normalized``, ``text``, ``niqqud``,
            ``dagesh`` and ``sin``.
        s: slice selecting which samples to evaluate.
        print_comparison: when true, also print each mismatching word pair,
            both texts and the per-sample accuracies.

    Returns:
        None; results are printed.
    """
    hebrew_letters = 'אבגדהוזחטיכלמנסעפצקרשתךםןףץ'

    batch = data.normalized[s]
    text = data.text[s]
    prediction = model.predict(batch)
    actual_niqqud, actual_dagesh, actual_sin = (
        dataset.from_categorical(prediction[k]) for k in range(3)
    )
    expected_niqqud, expected_dagesh, expected_sin = data.niqqud[s], data.dagesh[s], data.sin[s]

    # The texts are merged before the predictions are zeroed in the loop below.
    actual = dataset.merge(text, batch, actual_niqqud, actual_dagesh, actual_sin)
    expected = dataset.merge(text, batch, expected_niqqud, expected_dagesh, expected_sin)

    total_decisions, total_words, total_letters = [], [], []
    for i, (_, a, e) in enumerate(zip(batch, actual, expected)):
        exp_n, act_n = expected_niqqud[i], actual_niqqud[i]
        exp_d, act_d = expected_dagesh[i], actual_dagesh[i]
        exp_s, act_s = expected_sin[i], actual_sin[i]
        label_pairs = ((exp_n, act_n), (exp_d, act_d), (exp_s, act_s))

        # Decision level: only positions that carry an expected label.
        decisions = []
        for exp_row, act_row in label_pairs:
            labelled = exp_row > 0
            decisions.extend(exp_row[labelled] == act_row[labelled])
        total_decisions.extend(decisions)

        # Zero the predictions wherever no label is expected, so they cannot
        # count as letter-level mismatches.
        for exp_row, act_row in label_pairs:
            act_row[exp_row == 0] = 0

        either = (exp_n > 0) | (exp_d > 0) | (exp_s > 0)
        letters = ((exp_n[either] == act_n[either])
                   & (exp_d[either] == act_d[either])
                   & (exp_s[either] == act_s[either]))
        total_letters.extend(letters)

        # Word level: skip tokens with at most one distinct Hebrew letter.
        words = []
        for actual_word, expected_word in zip(a.split(), e.split()):
            distinct = sum(1 for letter in hebrew_letters if letter in actual_word)
            if distinct <= 1:
                continue
            same = actual_word == expected_word
            words.append(same)
            if print_comparison and not same:
                print(actual_word, expected_word)
        total_words.extend(words)

        if not print_comparison:
            continue
        print('מצוי: ', a)
        print('רצוי: ', e)
        print(f'letters: {np.mean(letters):.2%} ({len(letters)-np.sum(letters)} out of {len(letters)})')
        print(f'decisions: {np.mean(decisions):.2%} ({len(decisions)-np.sum(decisions)} out of {len(decisions)})')
        print(f'words: {np.mean(words):.2%} ({len(words)-np.sum(words)} out of {len(words)})')
        print()
    print(f'letters: {np.mean(total_letters):.2%}, decisions: {np.mean(total_decisions):.2%}, words: {np.mean(total_words):.2%}')

# Evaluate every test sample and print the per-sentence comparisons.
# NOTE(review): the trailing figures look like a result recorded from an earlier run -- confirm.
real_evaluation(test, s=slice(0, None), print_comparison=True)  # letters: 95.44%, words: 79.09%

מצוי:  שִׁמְעוֹן פֶּרֶס.
רצוי:  שִׁמְעוֹן פֶּרֶס.
letters: 100.00% (0 out of 8)
decisions: 100.00% (0 out of 14)
words: 100.00% (0 out of 2)

סְנוֹקֵר סְנוּקֶר
מצוי:  מִשְׂחֲקוֹ הַמָּהִיר. קַרְיֵירָה הִיגִינְס הֵחֵל לְשַׂחֵק סְנוֹקֵר מִקְצוֹעָנִי בְּעוֹנַת 1971/1972, בְּגִיל 22. הוּא זָכָה 
רצוי:  מִשְׂחֲקוֹ הַמָּהִיר. קַרְיֵירָה הִיגִינְס הֵחֵל לְשַׂחֵק סְנוּקֶר מִקְצוֹעָנִי בְּעוֹנַת 1971/1972, בְּגִיל 22. הוּא זָכָה 
letters: 96.43% (2 out of 56)
decisions: 98.08% (2 out of 104)
words: 91.67% (1 out of 12)

מַמְּנִיעִים מִמְּנִיעִים
א-פּוֹלִיטִיִּים, אַ-פּוֹלִיטִיִּים,
מצוי:  מַמְּנִיעִים א-פּוֹלִיטִיִּים, סוֹצְיוֹלוֹגִיִּים אוֹ מִגְדָּרִיִּים, לָרוֹב מִתּוֹךְ אִידֵאוֹלוֹגְיָה שֶׁל צְדָקָה וּגְמִילוּת חֲסָדִים. 
רצוי:  מִמְּנִיעִים אַ-פּוֹלִיטִיִּים, סוֹצְיוֹלוֹגִיִּים אוֹ מִגְדָּרִיִּים, לָרוֹב מִתּוֹךְ אִידֵאוֹלוֹגְיָה שֶׁל צְדָקָה וּגְמִילוּת חֲסָדִים. 
letters: 97.01% (2 out of 67)
decisions: 98.43% (2 out of 127)
words: 83.33% (2 out of 12)

בְּאִיָאן בָּאיָאן
מְבָאֲרִין[1] מִב

In [None]:
import hebrew
import dataset

In [None]:
# Set the WANDB_MODE environment variable to "run" for this kernel before the
# wandb.init() call below.
%env WANDB_MODE run
# Log a table of (maxlen, letters, words) figures to a dedicated wandb run.
# NOTE(review): the figures are hard-coded; presumably they were copied from
# earlier real_evaluation() runs at each maxlen -- confirm their origin.
config = {
    'batch_size': 64,
    'units': 500,
    'order': [
        ('mix',    [(30e-4, 80e-4, 1e-4)], 'mix'),
        ('modern', [(50e-4, 50e-4, 1e-5)], 'modern'),
        ('modern', [(50e-4, 50e-4, 1e-5)], 'modern_over'),
    ],
}
# No `group` is set for this run (a "maxlen" group was left disabled).
run = wandb.init(
    project="dotter",
    name='maxlen_test',
    tags=['CLR', 'ordered'],
    config=config,
)

with run:
    for maxlen, letters, words in (
        (75, 0.9511, 0.7778),
        (80, 0.9531, 0.7819),
        (85, 0.9535, 0.7819),
        (90, 0.9526, 0.7841),
        (95, 0.9514, 0.7795),
    ):
        run.log(dict(maxlen=maxlen, letters=letters, words=words))


In [None]:
# Build a fresh (untrained) 500-unit model and print its layer summary.
# Note that this rebinds the notebook-level `model`.
model = build_model(units=500)
model.summary()