In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow as tf
import tensorflowjs as tfjs

import wandb
from wandb.keras import WandbCallback

import dataset
import schedulers

assert tf.config.list_physical_devices('GPU')

In [3]:
def masked_metric(v, y_true):
    """Average the per-position values ``v`` over positions whose label is non-zero.

    Positions where ``y_true`` equals 0 are excluded from both the sum and
    the count.

    Args:
        v: tensor of per-position values; ``tf.boolean_mask`` requires its
            leading dimensions to match the shape of ``y_true``.
        y_true: label tensor; entries equal to 0 are ignored.

    Returns:
        A scalar tensor: the sum of the selected values divided by how many
        were selected, or 0 when no label is non-zero.
    """
    mask = tf.math.not_equal(y_true, 0)
    selected = tf.boolean_mask(v, mask)
    # Cast the count to the dtype of the values so the division also works
    # when ``v`` is not float32.
    count = tf.cast(tf.math.count_nonzero(mask), selected.dtype)
    # With no non-zero label a plain division evaluates 0 / 0 = NaN;
    # divide_no_nan returns 0 in that case instead.
    return tf.math.divide_no_nan(tf.reduce_sum(selected), count)

def accuracy(y_true, y_pred):
    """Sparse categorical accuracy averaged over positions with a non-zero label.

    Args:
        y_true: integer label tensor; positions equal to 0 are ignored.
        y_pred: per-class scores for every position.

    Returns:
        The value produced by ``masked_metric`` for the per-position matches.
    """
    per_position_hits = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    return masked_metric(per_position_hits, y_true)

def sparse_categorical_crossentropy(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over positions with a non-zero label.

    ``y_pred`` is treated as logits (``from_logits=True``).

    Args:
        y_true: integer label tensor; positions equal to 0 are ignored.
        y_pred: unnormalised per-class scores for every position.

    Returns:
        The value produced by ``masked_metric`` for the per-position losses.
    """
    per_position_loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=True)
    return masked_metric(per_position_loss, y_true)

def get_xy(d):
    """Shuffle ``d`` and return its inputs and targets as an ``(x, y)`` pair.

    Args:
        d: object exposing ``shuffle()`` plus the attributes ``normalized``,
            ``niqqud``, ``dagesh`` and ``sin``; may be ``None``.

    Returns:
        ``None`` when ``d`` is ``None``; otherwise a tuple ``(x, y)`` where
        ``x`` is ``d.normalized`` and ``y`` maps the output names
        ``'N'``, ``'D'`` and ``'S'`` to ``d.niqqud``, ``d.dagesh`` and ``d.sin``.
    """
    if d is not None:
        # Shuffle first: the attributes are read only after the reordering.
        d.shuffle()
        features = d.normalized
        targets = dict(N=d.niqqud, D=d.dagesh, S=d.sin)
        return features, targets
    return None


In [4]:
# Raw corpora keyed by training stage: 'mix' combines three source
# directories, 'modern' reads a single one.
corpus = {
    'mix': dataset.read_corpora([
        'hebrew_diacritized/poetry',
        'hebrew_diacritized/rabanit',
        'hebrew_diacritized/pre_modern',
    ]),
    'modern': dataset.read_corpora([
        'hebrew_diacritized/modern',
    ]),
}


In [5]:
MAXLEN = 90

# Load each corpus with its own validation rate. NumPy is re-seeded before
# every load so each call starts from the same random state.
data = {}
for corpus_name, validation_rate in (('mix', 0.01), ('modern', 0.1)):
    np.random.seed(2)
    data[corpus_name] = dataset.load_data(corpus[corpus_name],
                                          validation_rate=validation_rate,
                                          maxlen=MAXLEN)


In [6]:
# Sizes of the lookup tables: input embedding size and the three output head sizes.
LETTERS_SIZE, NIQQUD_SIZE, DAGESH_SIZE, SIN_SIZE = map(len, (
    dataset.letters_table,
    dataset.niqqud_table,
    dataset.dagesh_table,
    dataset.sin_table,
))

def build_model(units):
    """Build the three-headed sequence model.

    The network is an embedding (index 0 is masked), two stacked
    bidirectional LSTMs whose directions are summed, a dense layer, and three
    dense output heads named ``'N'``, ``'D'`` and ``'S'``.

    Args:
        units: width of the embedding, of each LSTM and of the shared dense layer.

    Returns:
        An uncompiled ``keras.Model`` with one variable-length integer input
        and the outputs ``[N, D, S]`` in that order.
    """
    tokens = keras.Input(shape=(None,), batch_size=None)
    hidden = layers.Embedding(LETTERS_SIZE, units, mask_zero=True)(tokens)

    for _ in range(2):
        recurrent = layers.LSTM(units, return_sequences=True, dropout=0.1)
        hidden = layers.Bidirectional(recurrent, merge_mode='sum')(hidden)
    hidden = layers.Dense(units)(hidden)

    # Heads are created in N, D, S order so the output order stays the same.
    heads = [('N', NIQQUD_SIZE), ('D', DAGESH_SIZE), ('S', SIN_SIZE)]
    outputs = [layers.Dense(size, name=head)(hidden) for head, size in heads]
    return keras.Model(inputs=tokens, outputs=outputs)


def real_evaluation(model, data, s=slice(0, None), print_comparison=True):
    """Score the model's predictions on ``data[s]`` at letter, decision and word level.

    Predictions are forced to 0 wherever the expected label is 0, so only
    positions that carry an expected mark are judged.

    * decisions: every non-zero expected niqqud/dagesh/sin label, compared
      individually with the prediction.
    * letters: every position where at least one of the three expected labels
      is non-zero; correct only when all three predictions match.
    * words: whitespace-separated tokens of the merged text that contain more
      than one distinct character of the Hebrew letter list; correct when
      the predicted and expected tokens are identical strings.

    Args:
        model: object whose ``predict(batch)`` returns the three outputs in
            niqqud, dagesh, sin order.
        data: object exposing ``normalized``, ``niqqud``, ``dagesh``, ``sin``
            and ``text``, each indexable by ``s``.
        s: slice selecting the samples to evaluate.
        print_comparison: when true, print every mismatching word pair and a
            per-sample report.

    Returns:
        ``(letters, decisions, words)``: the mean correctness of each kind
        over all evaluated samples.
    """
    hebrew_letters = 'אבגדהוזחטיכלמנסעפצקרשתךםןףץ'

    batch = data.normalized[s]
    prediction = model.predict(batch)
    actual_niqqud = dataset.from_categorical(prediction[0])
    actual_dagesh = dataset.from_categorical(prediction[1])
    actual_sin = dataset.from_categorical(prediction[2])
    expected_niqqud = data.niqqud[s]
    expected_dagesh = data.dagesh[s]
    expected_sin = data.sin[s]

    # (expected, actual) pairs for the three kinds of marks.
    channels = (
        (expected_niqqud, actual_niqqud),
        (expected_dagesh, actual_dagesh),
        (expected_sin, actual_sin),
    )
    # Ignore predictions at positions that carry no expected mark.
    for expected_marks, actual_marks in channels:
        actual_marks[expected_marks == 0] = 0

    actual = dataset.merge(data.text[s], batch, actual_niqqud, actual_dagesh, actual_sin)
    expected = dataset.merge(data.text[s], batch, expected_niqqud, expected_dagesh, expected_sin)

    total_decisions, total_words, total_letters = [], [], []
    for i, (_, a, e) in enumerate(zip(batch, actual, expected)):
        present = [expected_marks[i] > 0 for expected_marks, _unused in channels]

        decisions = []
        for (expected_marks, actual_marks), mask in zip(channels, present):
            decisions.extend(expected_marks[i][mask] == actual_marks[i][mask])
        total_decisions.extend(decisions)

        either = present[0] | present[1] | present[2]
        letter_hits = [expected_marks[i][either] == actual_marks[i][either]
                       for expected_marks, actual_marks in channels]
        letters = letter_hits[0] & letter_hits[1] & letter_hits[2]
        total_letters.extend(letters)

        words = []
        for aw, ew in zip(a.split(), e.split()):
            distinct_letters = sum(1 for ch in hebrew_letters if ch in aw)
            if distinct_letters <= 1:
                continue
            same = aw == ew
            words.append(same)
            if print_comparison and not same:
                print(aw, ew)
        total_words.extend(words)

        if print_comparison:
            print('מצוי: ', a)
            print('רצוי: ', e)
            print(f'letters: {np.mean(letters):.2%} ({len(letters)-np.sum(letters)} out of {len(letters)})')
            print(f'decisions: {np.mean(decisions):.2%} ({len(decisions)-np.sum(decisions)} out of {len(decisions)})')
            print(f'words: {np.mean(words):.2%} ({len(words)-np.sum(words)} out of {len(words)})')
            print()

    letters_score = np.mean(total_letters)
    decisions_score = np.mean(total_decisions)
    words_score = np.mean(total_words)
    print(f'letters: {letters_score:.2%}, decisions: {decisions_score:.2%}, words: {words_score:.2%}')
    return (letters_score, decisions_score, words_score)


In [7]:
%env WANDB_MODE dryrun

def experiment(n):
    """Train a fresh model through the configured stages and log the final scores.

    A model with 400 units is built and compiled, then trained stage by stage.
    Each stage is a tuple ``(kind, initial_epoch, epochs, scheduler, save)``:
    ``kind`` selects the entry of ``data`` to train on, ``scheduler`` is the
    learning-rate callback for the stage and ``save`` is the checkpoint name
    written under ``./checkpoints/``. After every stage the model is scored
    with ``real_evaluation`` on the validation part of ``data['modern']``; the
    scores of the last stage are logged to the W&B run.

    Args:
        n: experiment id, stored in the W&B config as ``experiment_id``.

    Returns:
        The trained Keras model.
    """
    batch_size = 64
    units = 400
    np.random.seed(2)
    model = build_model(units=units)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  metrics=accuracy)

#     model.save_weights('./checkpoints/uninit')
#     model.load_weights('./checkpoints/mix')

    modern_lrs = [30e-4, 30e-4, 30e-4,  8e-4, 1e-4]

    def modern_schedule(epoch, lr):
        # The modern stage starts at epoch index 1, hence the -1 offset.
        return modern_lrs[epoch - 1]

    mix_stage = (
        'mix', 0, 1,
        schedulers.CircularLearningRate(3e-3, 8e-3, 0e-4, data['mix'][0], batch_size),
        'mix',
    )
    modern_stage = (
        'modern', 1, (1 + len(modern_lrs)),
        tf.keras.callbacks.LearningRateScheduler(modern_schedule),
        'modern',
    )
    config = {
        'batch_size': batch_size,
        'maxlen': MAXLEN,
        'units': units,
        'experiment_id': n,
        'order': [mix_stage, modern_stage],
    }

    run = wandb.init(project="dotter",
                     group=f"{MAXLEN=} 30:80:0, " + '-'.join(f'{lr}' for lr in modern_lrs),
#                      name=f'30-80-0, 20-30-20-5-1',
                     tags=[],
                     config=config)
    with run:
        for kind, initial_epoch, epochs, scheduler, save in config['order']:
            train, validation = data[kind]

            training_data = get_xy(train)
            x, y = training_data
            validation_data = get_xy(validation)

            wandb_callback = WandbCallback(log_batch_frequency=50,  # int(len(train.normalized) / BATCH_SIZE / 100),
                                           training_data=training_data,
                                           validation_data=validation_data,
                                           save_model=False,
                                           log_weights=False)

            model.fit(x, y, validation_data=validation_data,
                      initial_epoch=initial_epoch,
                      epochs=epochs,
                      batch_size=batch_size, verbose=1,
                      callbacks=[wandb_callback, scheduler])

            letters, decisions, words = real_evaluation(model, data['modern'][1],
                                                        s=slice(0, None),
                                                        print_comparison=False)
            model.save_weights('./checkpoints/' + save)
        # Only the scores of the last stage are logged.
        run.log({'index': 0, 'letters': letters, 'decisions': decisions, 'words': words})
    return model

# Run the experiment once per id; `model` ends up bound to the model returned
# by the last run.
# NOTE(review): the figures in the trailing comment look like results recorded
# from earlier runs — confirm.
for n in range(1):
    model = experiment(n)   # 20-30-20-5-1: 88.08-88.16

env: WANDB_MODE=dryrun


wandb: Offline run mode, not syncing to the cloud.
wandb: W&B is disabled in this directory.  Run `wandb on` to enable cloud syncing.


letters: 85.48%, decisions: 91.57%, words: 64.75%
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
letters: 95.92%, decisions: 97.68%, words: 88.22%


0,1
loss,0.04595
N_loss,0.02882
D_loss,0.01556
S_loss,0.00158
N_accuracy,0.99031
D_accuracy,0.99442
S_accuracy,0.99954
_step,69.0
_runtime,479.0
_timestamp,1611256519.0


0,1
loss,█▅▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
N_loss,█▆▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
D_loss,█▅▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
S_loss,█▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
N_accuracy,▁▃▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████████████
D_accuracy,▁▆▇▇▇▇▇▇▇▇██████████████████████████████
S_accuracy,▁▇▇▇▇███████████████████████████████████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███


wandb: You can sync this run to the cloud by running:
wandb: wandb sync wandb\offline-run-20210121_210719-2z1mn0qk


In [None]:
# Rebuild the architecture, restore trained weights and export the model both
# as Keras HDF5 and in TensorFlow.js format.
# NOTE(review): this cell builds with units=512 while a later cell loads the
# same 'modern_over2' checkpoint into units=800 — confirm which width matches
# the checkpoint.
model = build_model(units=512)
model.load_weights('./checkpoints/modern_over2')

# Compiled without loss or metrics before saving.
model.compile()
model.save('models/modern.h5')
tfjs.converters.save_keras_model(model, 'models/')

# hack around tfjs bug:
# rewrite the class name "Functional" as "Model" in the generated model.json.
with open('models/model.json', encoding='utf8', mode='r') as f:
    text = f.read().replace('"Functional"', '"Model"')
with open('models/model.json', encoding='utf8', mode='w') as f:
    f.write(text)

In [None]:
import matplotlib.pyplot as plt

# Plot training vs. validation curves: one row per quantity (accuracy, loss)
# and one column per output head (N, D, S).
# NOTE(review): `history` is only assigned inside experiment() in the visible
# cells; this cell presumably relies on a notebook-level `history` from an
# earlier session — confirm.
fig, ax = plt.subplots(nrows=2, ncols=3)

for row, v in zip(ax, ['accuracy', 'loss']):
    for p, t in zip(row, ['N', 'D', 'S']):
        train_key = f'{t}_{v}'
        p.plot(history.history[train_key][0:])
        p.plot(history.history['val_' + train_key][0:])
        p.legend([t + '_Train', t + '_Test'], loc='center right')

plt.tight_layout()

In [None]:
# Load the corpus under test/modernTestCorpus/ with a fixed NumPy seed.
np.random.seed(3)
# Only the first element returned by load_data is kept.
# NOTE(review): the positional arguments 0 and MAXLEN presumably correspond to
# validation_rate and maxlen as used in the training-data cell — confirm.
test, _ = dataset.load_data(dataset.read_corpora(['test/modernTestCorpus/']), 0, MAXLEN)

In [None]:
# Evaluate a restored model on the test set with the masked loss and accuracy.
# NOTE(review): units=800 here, but the export cell loads the same
# 'modern_over2' checkpoint into units=512 — confirm which width is correct.
model = build_model(units=800)
model.load_weights('./checkpoints/modern_over2')

x = test.normalized
y = dict(N=test.niqqud, D=test.dagesh, S=test.sin)

model.compile(loss=sparse_categorical_crossentropy,
              metrics={head: accuracy for head in ('N', 'D', 'S')})

_ = model.evaluate(x=x, y=y, batch_size=64)

In [None]:
import hebrew
import dataset

In [None]:
# Show the normalized input and the niqqud labels of the first sample in the
# validation part of the 'modern' data.
print(data['modern'][1].normalized[0],
      data['modern'][1].niqqud[0],
      sep='\n')

In [None]:
%env WANDB_MODE run
# Log a table of hard-coded (maxlen, letters, words) rows to a W&B run.
config = dict(
    batch_size=64,
    units=500,
    order=[
        ('mix',    [(30e-4, 80e-4, 1e-4)], 'mix'),
        ('modern', [(50e-4, 50e-4, 1e-5)], 'modern'),
        ('modern', [(50e-4, 50e-4, 1e-5),
                    # (50e-4, 50e-4, 1e-5),
                   ], 'modern_over'),
    ],
)
run = wandb.init(project="dotter",
                 # group="maxlen",
                 name='maxlen_test',
                 tags=['CLR', 'ordered'],
                 config=config)

with run:
    for maxlen, letters, words in [
            (75, 0.9511, 0.7778),
            (80, 0.9531, 0.7819),
            (85, 0.9535, 0.7819),
            (90, 0.9526, 0.7841),
            (95, 0.9514, 0.7795),
    ]:
        run.log(dict(maxlen=maxlen, letters=letters, words=words))


In [17]:
# Parameter counts noted for several configurations
# (NOTE(review): "level" presumably means the number of stacked bidirectional
# LSTM layers and "h" the `units` argument — confirm):
# 2 level, h=400: 5,313,223 params
# 1 level, h=400: 2,750,023 params
# 1 level, h=557: 5,316,031 params
# Build a model with 557 units and print its layer summary.
model = build_model(units=557)
model.summary()

Model: "functional_27"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, None, 557)    23951       input_14[0][0]                   
__________________________________________________________________________________________________
bidirectional_16 (Bidirectional (None, None, 557)    4968440     embedding_13[0][0]               
__________________________________________________________________________________________________
dense_13 (Dense)                (None, None, 557)    310806      bidirectional_16[0][0]           
______________________________________________________________________________________

In [1]:
import tensorflow as tf
import tensorflowjs as tfjs

# Load the saved Keras model from HDF5 and write it out in TensorFlow.js format.
model = tf.keras.models.load_model('final_model/final.h5')
tfjs.converters.save_keras_model(model, 'final_model/')
# Display the tensorflowjs version used for the conversion.
tfjs.__version__

  return h5py.File(h5file)


'2.4.0'

In [3]:
# Build a fresh model with 400 units; no weights are loaded in this cell.
model = build_model(400)