In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
import os

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow as tf
import tensorflowjs as tfjs

import wandb
from wandb.keras import WandbCallback

import dataset
import schedulers

assert tf.config.list_physical_devices('GPU')

In [6]:
def masked_metric(v, y_true):
    """Average the values in ``v`` over the positions whose label is non-zero.

    Args:
        v: Tensor of per-position values (a loss or a metric). The boolean mask
            derived from ``y_true`` is applied to it with ``tf.boolean_mask``.
        y_true: Label tensor; positions equal to 0 are left out of the average.

    Returns:
        Scalar tensor: the sum of the selected values divided by the number of
        selected positions, or 0 when no position is selected.
    """
    mask = tf.math.not_equal(y_true, 0)
    selected_sum = tf.reduce_sum(tf.boolean_mask(v, mask))
    selected_count = tf.cast(tf.math.count_nonzero(mask), tf.float32)
    # With no non-zero label in the batch a plain division is 0 / 0 = NaN,
    # which would poison the aggregated loss/metric; divide_no_nan gives 0.
    return tf.math.divide_no_nan(selected_sum, selected_count)

def accuracy(y_true, y_pred):
    """Sparse categorical accuracy, averaged only over positions with a non-zero label."""
    per_position_hits = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    return masked_metric(per_position_hits, y_true)

def sparse_categorical_crossentropy(y_true, y_pred):
    """Sparse categorical cross-entropy on logits, averaged only over positions with a non-zero label."""
    per_position_loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=True)
    return masked_metric(per_position_loss, y_true)

def get_xy(d):
    """Shuffle a dataset and split it into model inputs and per-output targets.

    Args:
        d: Dataset object exposing ``shuffle()``, ``normalized``, ``niqqud``,
            ``dagesh`` and ``sin``; or None.

    Returns:
        None when ``d`` is None. Otherwise ``(x, y)`` where ``x`` is
        ``d.normalized`` and ``y`` maps the output names 'N', 'D' and 'S' to
        ``d.niqqud``, ``d.dagesh`` and ``d.sin``. ``d.shuffle()`` is called
        first and its return value is ignored.
    """
    if d is not None:
        d.shuffle()
        inputs = d.normalized
        targets = dict(N=d.niqqud, D=d.dagesh, S=d.sin)
        return inputs, targets
    return None


In [7]:
# Raw corpora grouped under the two names ('mix', 'modern') that the data
# loading step looks them up by.
corpus = {
    'mix': dataset.read_corpora([
        'hebrew_diacritized_private/poetry',
        'hebrew_diacritized_private/rabanit',
        'hebrew_diacritized_private/pre_modern',
    ]),
    'modern': dataset.read_corpora([
        'hebrew_diacritized/modern',
    ]),
}


In [8]:
MAXLEN = 80

# Load each corpus with a 10% validation share. NumPy is reseeded with the
# same value immediately before every load so each call starts from the same
# RNG state (presumably load_data draws its split from NumPy -- confirm).
data = {}
for corpus_name in ('mix', 'modern'):
    np.random.seed(2)
    data[corpus_name] = dataset.load_data(corpus[corpus_name], validation_rate=0.1, maxlen=MAXLEN)


In [15]:
# Table sizes from `dataset`: LETTERS_SIZE is the embedding's input dimension,
# the other three are the widths of the 'N', 'D' and 'S' output layers.
LETTERS_SIZE, NIQQUD_SIZE, DAGESH_SIZE, SIN_SIZE = map(len, (
    dataset.letters_table,
    dataset.niqqud_table,
    dataset.dagesh_table,
    dataset.sin_table,
))

def build_model(units, dropout=0.1):
    """Build the three-output sequence tagger.

    Architecture: embedding (index 0 masked) -> bidirectional LSTM ->
    a second bidirectional LSTM added residually to the first -> Dense(units)
    -> three parallel Dense layers named 'N', 'D' and 'S'. The output layers
    have no activation, so they emit raw scores.

    Args:
        units: Width of the embedding, of both LSTMs and of the shared dense layer.
        dropout: Input dropout rate passed to both LSTM layers. Defaults to
            0.1, the value that used to be hard-coded, so existing callers
            get the same network.

    Returns:
        An uncompiled ``keras.Model`` with one variable-length input and the
        outputs ``[N, D, S]``.
    """
    def bidirectional_lstm():
        # Forward and backward outputs are summed, which keeps the width at
        # `units` and lets the residual add below line up.
        return layers.Bidirectional(
            layers.LSTM(units, return_sequences=True, dropout=dropout),
            merge_mode='sum')

    inp = keras.Input(shape=(None,), batch_size=None)
    embed = layers.Embedding(LETTERS_SIZE, units, mask_zero=True)(inp)

    # Layers are created in the same order as before so auto-generated layer
    # names and checkpoint structure stay compatible with saved weights.
    layer = bidirectional_lstm()(embed)
    layer = layers.add([layer, bidirectional_lstm()(layer)])
    layer = layers.Dense(units)(layer)

    outputs = [
        layers.Dense(NIQQUD_SIZE, name='N')(layer),
        layers.Dense(DAGESH_SIZE, name='D')(layer),
        layers.Dense(SIN_SIZE, name='S')(layer),
    ]
    return keras.Model(inputs=inp, outputs=outputs)


In [20]:
%env WANDB_MODE run

def experiment():
    """Train one model through the staged schedule below, logging to Weights & Biases.

    Each ``(kind, clr, save)`` entry of ``config['order']`` is one stage: the
    model is fitted once (``fit`` is called without ``epochs``, i.e. the Keras
    default) on ``data[kind]`` with a ``CircularLearningRate(*clr)`` callback,
    then its weights are written to ``./checkpoints/<save>`` and the full
    model to ``<wandb run dir>/<save>.h5``. The same model object is carried
    from stage to stage.

    Returns:
        The trained ``keras.Model``.
    """
    BATCH_SIZE = 64
    UNITS = 512
    SEED = 2
    # Seed TensorFlow as well as NumPy: seeding NumPy alone leaves weight
    # initialisation and dropout masks different on every run.
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    model = build_model(units=UNITS)
    model.compile(loss=sparse_categorical_crossentropy, optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  metrics=accuracy)

    # Keep a checkpoint of the freshly initialised weights.
    model.save_weights('./checkpoints/uninit')
    # model.load_weights('./checkpoints/modern_over2')

    config = {
        'batch_size': BATCH_SIZE,
        'maxlen': MAXLEN,
        'units': UNITS,
        'model': model,
        # (key into `data`, arguments for CircularLearningRate, checkpoint name)
        'order': [
              ('mix',    (3e-3, 8e-3, 0), 'mix'),
              ('modern', (4e-3, 2e-3, 5e-4), 'modern'),
              ('modern', (27e-4, 11e-4, 8e-4), 'modern_over'),
              ('modern', (15e-4, 10e-4, 5e-4), 'modern_over1'),
              ('modern', (12e-4, 8e-4, 5e-4), 'modern_over2'),
              ('modern', (8e-4, 8e-4, 1e-4), 'modern_over3'),
        ],
    }

    # NOTE(review): the run is named 'no dropout' although build_model applies
    # dropout=0.1 to both LSTM layers -- confirm the name is still accurate.
    run = wandb.init(project="dotter",
                     group="mix_and_modern",
                     name='no dropout',
                     tags=['CLR', 'ordered'],
                     config=config)
    with run:
        for kind, clr, save in config['order']:
            train, validation = data[kind]

            x, y = training_data = get_xy(train)
            validation_data = get_xy(validation)

            wandb_callback = WandbCallback(log_batch_frequency=10,  # int(len(train.normalized) / BATCH_SIZE / 100),
                                           training_data=training_data,
                                           validation_data=validation_data,
                                           log_weights=True)
            scheduler = schedulers.CircularLearningRate(*clr)
            scheduler.set_dataset(train, BATCH_SIZE)
            callbacks = [wandb_callback, scheduler]
            # The History object returned by fit() is not kept: it was only
            # ever bound to a local name and never read.
            model.fit(x, y, validation_data=validation_data,
                      batch_size=BATCH_SIZE, verbose=1, callbacks=callbacks)
            model.save_weights('./checkpoints/' + save)
            model.save(os.path.join(wandb.run.dir, save + ".h5"))
    return model

# Run the whole staged training and bind the trained network to the
# notebook-level name `model`.
model = experiment()

env: WANDB_MODE=run


  48/1716 [..............................] - ETA: 3:20 - loss: 2.3928 - N_loss: 1.5475 - D_loss: 0.4473 - S_loss: 0.3980 - N_accuracy: 0.4603 - D_accuracy: 0.8232 - S_accuracy: 0.8688

wandb: Network error resolved after 0:00:17.032618, resuming normal operation.




UnknownError: Failed to rename: ./checkpoints/modern_over3_temp_3159ed33ee8d4e93a3bd188a96021ed2/part-00000-of-00001.data-00000-of-00001 to: ./checkpoints/modern_over3.data-00000-of-00001 : Access is denied.
; Input/output error [Op:MergeV2Checkpoints]

In [19]:
# Export: restore the 'modern_over2' checkpoint into a fresh 512-unit network,
# then write it both as a Keras HDF5 file and as a TensorFlow.js model
# under models/.
model = build_model(units=512)
model.load_weights('./checkpoints/modern_over2')

model.compile()
model.save('models/modern.h5')
tfjs.converters.save_keras_model(model, 'models/')

# hack around tfjs bug: every "Functional" in the exported model.json is
# rewritten to "Model" (presumably the class name the tfjs loader expects).
with open('models/model.json', mode='r', encoding='utf8') as exported:
    original_json = exported.read()
patched_json = original_json.replace('"Functional"', '"Model"')
with open('models/model.json', mode='w', encoding='utf8') as exported:
    exported.write(patched_json)



In [None]:
import matplotlib.pyplot as plt

# 2x3 grid of training curves: one row per quantity (accuracy, loss) and one
# column per output ('N', 'D', 'S'); each panel overlays the training series
# and its 'val_' counterpart.
# NOTE(review): `history` must already exist at notebook scope. None of the
# visible cells binds it there, so on a fresh kernel this cell raises NameError.
fig, ax = plt.subplots(nrows=2, ncols=3)

for panels, quantity in zip(ax, ('accuracy', 'loss')):
    for panel, output_name in zip(panels, ('N', 'D', 'S')):
        series_key = f'{output_name}_{quantity}'
        panel.plot(history.history[series_key])
        panel.plot(history.history['val_' + series_key])
        panel.legend([f'{output_name}_Train', f'{output_name}_Test'], loc='center right')

plt.tight_layout()

In [None]:
# Load the separate test corpus. load_data is given 0 as its second positional
# argument (presumably the validation rate -- confirm), and the second element
# of its result is discarded.
np.random.seed(3)
test_corpus = dataset.read_corpora(['test/modernTestCorpus/'])
test, _ = dataset.load_data(test_corpus, 0, MAXLEN)

In [None]:
# Score the 'modern_over2' checkpoint on the test set with the masked loss
# and masked accuracy for each of the three outputs.
# The network has to be built with the width the checkpoint was trained at:
# experiment() trains with UNITS = 512 and the other cells that restore this
# checkpoint build 512 units, so units=800 would not match the saved weights.
model = build_model(units=512)
model.load_weights('./checkpoints/modern_over2')
x = test.normalized
y = {'N': test.niqqud, 'D': test.dagesh, 'S': test.sin }

model.compile(loss=sparse_categorical_crossentropy,
              metrics={'N': accuracy, 'D': accuracy, 'S': accuracy})

_ = model.evaluate(x=x, y=y, batch_size=64)

In [21]:
def real_evaluation(data, s=slice(0, None), print_comparison=True):
    """Compare the notebook-level ``model``'s predictions with the reference labels and print the scores.

    Three rates are reported:
      * decisions -- per output, over the positions whose expected label is
        non-zero: does the predicted label equal the expected one;
      * letters -- over the positions where at least one output expects a
        non-zero label: do all three outputs match;
      * words -- over whitespace-separated tokens of the merged text that
        contain more than one distinct Hebrew letter: is the predicted token
        identical to the expected one.

    Predicted labels are zeroed wherever the expected label is 0 before any
    comparison is made.

    Args:
        data: Object exposing ``normalized``, ``niqqud``, ``dagesh``, ``sin``
            and ``text``, each indexable by ``s``.
        s: Slice selecting which samples to evaluate.
        print_comparison: When true, also print, per sample, the mismatching
            word pairs, both merged texts and that sample's three rates.

    Returns:
        None; all results are printed.
    """
    hebrew_letters = 'אבגדהוזחטיכלמנסעפצקרשתךםןףץ'

    batch = data.normalized[s]
    prediction = model.predict(batch)
    actual_heads = [dataset.from_categorical(prediction[k]) for k in range(3)]
    expected_heads = [data.niqqud[s], data.dagesh[s], data.sin[s]]

    # Ignore whatever was predicted at positions that carry no expected label.
    for predicted, reference in zip(actual_heads, expected_heads):
        predicted[reference == 0] = 0

    actual = dataset.merge(data.text[s], batch, *actual_heads)
    expected = dataset.merge(data.text[s], batch, *expected_heads)

    total_decisions, total_words, total_letters = [], [], []
    for i, (_, actual_text, expected_text) in enumerate(zip(batch, actual, expected)):
        want_n, want_d, want_s = (reference[i] for reference in expected_heads)
        got_n, got_d, got_s = (predicted[i] for predicted in actual_heads)

        decisions = []
        for want, got in ((want_n, got_n), (want_d, got_d), (want_s, got_s)):
            labelled = want > 0
            decisions.extend(want[labelled] == got[labelled])
        total_decisions.extend(decisions)

        either = (want_n > 0) | (want_d > 0) | (want_s > 0)
        all_match = (want_n == got_n) & (want_d == got_d) & (want_s == got_s)
        letters = all_match[either]
        total_letters.extend(letters)

        words = []
        for actual_word, expected_word in zip(actual_text.split(), expected_text.split()):
            # Only tokens with more than one distinct Hebrew letter count as words.
            if sum(letter in actual_word for letter in hebrew_letters) <= 1:
                continue
            is_same = actual_word == expected_word
            words.append(is_same)
            if print_comparison and not is_same:
                print(actual_word, expected_word)
        total_words.extend(words)

        if print_comparison:
            print('מצוי: ', actual_text)
            print('רצוי: ', expected_text)
            print(f'letters: {np.mean(letters):.2%} ({len(letters)-np.sum(letters)} out of {len(letters)})')
            print(f'decisions: {np.mean(decisions):.2%} ({len(decisions)-np.sum(decisions)} out of {len(decisions)})')
            print(f'words: {np.mean(words):.2%} ({len(words)-np.sum(words)} out of {len(words)})')
            print()
    print(f'letters: {np.mean(total_letters):.2%}, decisions: {np.mean(total_decisions):.2%}, words: {np.mean(total_words):.2%}')

# Restore the 'modern_over2' checkpoint and score it on the second element of
# data['modern'] (the validation part of the split), printing the summary only.
model = build_model(units=512)
model.load_weights('./checkpoints/modern_over2')
modern_validation = data['modern'][1]
real_evaluation(modern_validation, s=slice(0, None), print_comparison=False)
# own: letters: 95.59%, decisions: 97.50%, words: 87.64%

letters: 95.36%, decisions: 97.36%, words: 86.67%


In [None]:
import hebrew
import dataset

In [None]:
# Show the first sample of the modern validation part: its normalized input
# followed by its niqqud labels.
modern_validation = data['modern'][1]
for field in (modern_validation.normalized, modern_validation.niqqud):
    print(field[0])

In [None]:
%env WANDB_MODE run
# Settings stored with the W&B run; this cell only logs, it does not train.
config = dict(
    batch_size=64,
    units=500,
    order=[
        ('mix',    [(30e-4, 80e-4, 1e-4)], 'mix'),
        ('modern', [(50e-4, 50e-4, 1e-5)], 'modern'),
        ('modern', [(50e-4, 50e-4, 1e-5)], 'modern_over'),
    ],
)

# Hard-coded (maxlen, letters, words) rows; presumably the rates measured in
# separate runs at each maxlen -- confirm where they came from.
maxlen_results = [
    (75, 0.9511, 0.7778),
    (80, 0.9531, 0.7819),
    (85, 0.9535, 0.7819),
    (90, 0.9526, 0.7841),
    (95, 0.9514, 0.7795),
]

run = wandb.init(
    project="dotter",
    # group="maxlen",
    name='maxlen_test',
    tags=['CLR', 'ordered'],
    config=config,
)

# One log entry per row so the rates can be charted against maxlen.
with run:
    for maxlen, letters, words in maxlen_results:
        run.log(dict(maxlen=maxlen, letters=letters, words=words))


In [None]:
# Build a 400-unit network and print its layer-by-layer summary.
# Note that this rebinds the notebook-level `model`.
model = build_model(units=400)
model.summary()