In [1]:
# ignore some Keras warnings regarding deprecations and model saving 
import warnings
warnings.filterwarnings('ignore')

from keras.models import Model, load_model, Sequential
from keras.layers import Input, LSTM, Dense, RepeatVector, \
                         TimeDistributed, Activation
from keras.callbacks import EarlyStopping
import pickle

from helpers import *


Using TensorFlow backend.


Sentences from the [Tatoeba dataset](https://tatoeba.org/eng/downloads).

In [2]:
# --- Data ---
data_path = 'sentences.txt'  # Path to the txt file on disk.
num_samples = 220000  # Number of samples to train on.

# --- Misspelling generation ---
noise = .05  # Noise level handed to the misspelling helpers.
misspellings_count = 3  # Misspelled variants requested per phrase.

# --- Training ---
epochs = 100  # Number of epochs to train for (was assigned twice before).
batch_size = 128  # Batch size for training.

# Optimizer / loss names; these equal the values simple_lstm compiles with.
optimizer = 'adam'
loss_fn = 'categorical_crossentropy'


In [3]:
# Hand-pick maximum sequence lengths
max_seq_length = 25 # max (allowed) input sequence length
max_target_seq_length = max_seq_length + 2 # room for the [START] and [END] delimiters -> 27

In [4]:
def load_preprocessed(data_path, max_len):
    """Load phrases from a text file, preprocess and filter them, then shuffle.

    Args:
        data_path: path of the text file; it is read line by line.
        max_len: maximum allowed length of a (preprocessed) line.

    Returns:
        A list with the lines that survive the filter, shuffled with a fixed
        seed so that every run produces the same order.
    """
    # Explicit encoding: the result must not depend on the platform's locale.
    with open(data_path, encoding='utf-8') as f:
        lines = f.readlines()
    # `text_preprocess` is an external helper — presumably it normalizes the
    # raw lines (TODO confirm against helpers).
    lines = text_preprocess(lines)
    # Only phrases made up entirely of these characters are kept.
    allowed_chars = set(' !"#$%&\'()+,-./0123456789:;?[]_`abcdefghijklmnopqrstuvwxyz{}')
    # The length test looks at the line as returned by `text_preprocess`,
    # while the character test ignores surrounding whitespace.
    selected = [
        l for l in lines
        if len(l) <= max_len and all(c in allowed_chars for c in l.strip())
    ]
    # Shuffle deterministically (fixed seed).
    Random(0).shuffle(selected)
    return selected

In [5]:
# Load, filter and deterministically shuffle every usable phrase
all_phrases = load_preprocessed(data_path, max_seq_length)
# The split below needs phrases left over after taking `num_samples` for training
assert len(all_phrases) > num_samples
# The first `num_samples` phrases are used for training, the remainder for testing
train_phrases = all_phrases[:num_samples]
test_phrases = all_phrases[num_samples:]
print('All phrases in dataset: ', len(all_phrases))
print('Training phrases: ', len(train_phrases))
print('Test phrases: ', len(test_phrases))

# Show the first ten phrases as a bulleted list
print("\n * ".join(['Examples:'] + all_phrases[:10]))

All phrases in dataset:  239820
Training phrases:  220000
Test phrases:  19820
Examples:
 * tom is reading a book now
 * tom is quite strong
 * what did you just do?
 * tom unloaded the car
 * you aren't hurt
 * i had a hard time
 * tom actually likes me
 * i'm loaded
 * i began working
 * how was the interview?


In [6]:
# create token indices out of all phrases
token_idx = token_index(all_phrases + ['\t', '\n'])
# ^^ \t and \n are our [START] and [END] delimiters. With this trick
# we are adding them to the token index

# Size of the token index; later cells call len(token_idx) directly instead
num_encoder_tokens = len(token_idx)

print('Number of unique tokens:', num_encoder_tokens)

Number of unique tokens: 57


In [7]:
def simple_lstm(output_len, token_count, latent_dim=256, initializer='he_normal',
                loss='categorical_crossentropy', optimizer='adam'):
    """Build and compile the encoder -> repeat -> decoder LSTM model.

    Args:
        output_len: number of time steps the decoder emits.
        token_count: size of the token index (one-hot width of input and output).
        latent_dim: number of units in the encoder and decoder LSTMs.
        initializer: kernel initializer used by every trainable layer.
        loss: loss passed to `model.compile`.
        optimizer: optimizer passed to `model.compile`.

    Returns:
        The compiled Keras `Sequential` model (tracking the accuracy metric).
    """
    # "Encode" the input sequence using an RNN, producing a vector of size
    # `latent_dim`. `None` as the first input dimension lets the input
    # sequences have a variable length.
    encoder = LSTM(latent_dim, input_shape=(None, token_count),
                   kernel_initializer=initializer)

    # For the decoder's input, we repeat the encoded input for each time step
    repeater = RepeatVector(output_len)

    decoder = LSTM(latent_dim, return_sequences=True, kernel_initializer=initializer)

    # For each step of the output sequence, decide which character should be chosen
    time_dist = TimeDistributed(Dense(token_count, kernel_initializer=initializer))
    activation = Activation('softmax')

    model = Sequential([encoder, repeater, decoder, time_dist, activation])

    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model


Train for a few epochs on an identity function with a chunk of the dataset, as a sanity check

In [8]:
# Generate the noisy inputs and their matching correct phrases
misspellings, correct = create_misspellings(train_phrases,
                                            noise, misspellings_count,
                                            max_seq_length)

# Builtin `bool` rather than `np.bool`: the NumPy alias was deprecated and then
# removed in some NumPy releases, while `bool` names the same dtype everywhere.
X = vectorize_batch(misspellings, token_idx,
                    max_seq_length, dtype=bool)
# Targets are wrapped with the [START]/[END] delimiters, hence the longer length
Y = vectorize_batch(wrap_with_delims(correct), token_idx,
                    max_target_seq_length, dtype=bool)

In [None]:
# Build the model: it emits `max_target_seq_length` steps over `len(token_idx)` tokens
model = simple_lstm(max_target_seq_length, len(token_idx))
# Stop once val_loss has not improved by at least 0.005 for 3 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=.005, 
                               patience=3, verbose=0, mode='auto')
# Train on the in-memory arrays, holding out 8% of the samples for validation
model.fit(X,Y, batch_size=batch_size, epochs=epochs, validation_split=.08,
         callbacks= [early_stopping])

Train on 809600 samples, validate on 70400 samples
Epoch 1/100

In [None]:
# Plot the history recorded by the fit call above
plot_history(model.history)

In [None]:
# Keep a handle on this first model: `model` is rebound to a new one further down
first_model = model

In [None]:
def translator_fn(model, token_index, max_seq_len):
    """Create a `predict(phrase) -> str` function backed by `model`.

    Args:
        model: trained model exposing `predict`.
        token_index: mapping from token (character) to its integer index.
        max_seq_len: sequence length used when vectorizing the input phrase.

    Returns:
        A function that vectorizes a phrase, runs the model on it and decodes
        the most likely token at every output position. The first output
        position (the [START] slot) is dropped and the text is cut at the
        first "\n" ([END]); when no "\n" is predicted the whole remainder is
        returned.
    """
    inverse_token_index = {v: k for k, v in token_index.items()}
    def predict(in_phrase):
        x = vectorize_phrase(in_phrase, token_index, max_seq_len)
        # argmax over the token axis is what `predict_classes` computed for a
        # multi-class output; `predict_classes` no longer exists in newer Keras.
        pred_idxes = model.predict(x, verbose=0).argmax(axis=-1)[0]
        txt = ''.join(inverse_token_index[i] for i in pred_idxes)
        end_idx = txt.find("\n")
        if end_idx == -1:
            # No end delimiter predicted: keep everything instead of letting
            # the -1 slice silently drop the last character.
            end_idx = len(txt)
        return txt[1:end_idx]
    return predict

def evaluate_correct(texts, corrector):
    """Return the fraction of `texts` that `corrector` fails to leave unchanged.

    Args:
        texts: phrases that are already correct.
        corrector: function mapping a phrase to its corrected version.

    Returns:
        Error rate as a float in [0, 1]; 0.0 when `texts` is empty.
    """
    if len(texts) == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0
    errors = sum(1 for t in texts if t != corrector(t))
    return errors / len(texts)

def evaluate_misspelled(texts, corrector, noise=0.05):
    """Return the fraction of `texts` not recovered after noise is added.

    Each phrase is passed through `add_noise_to_string` and then through
    `corrector`; it counts as an error when the result differs from the
    original phrase.

    Args:
        texts: correct phrases.
        corrector: function mapping a phrase to its corrected version.
        noise: noise level handed to `add_noise_to_string` (default 0.05).

    Returns:
        Error rate as a float in [0, 1]; 0.0 when `texts` is empty.
    """
    if len(texts) == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0
    errors = sum(1 for t in texts
                 if t != corrector(add_noise_to_string(t, noise)))
    return errors / len(texts)

In [None]:
def training_vectorizer_fn(token_index, max_encoder_seq_length,
                           max_decoder_seq_length):
    """Create a closure fn for vectorization that "knows" the
    token index and seq lengths.

    Args:
        token_index: token index handed to `vectorize_batch`.
        max_encoder_seq_length: sequence length used for the input texts.
        max_decoder_seq_length: sequence length used for the target texts.

    Returns:
        A function `(input_texts, target_texts) -> (X, Y)`.
    """
    def training_vectorizer(input_texts, target_texts):
        # Builtin `bool` rather than `np.bool`: the NumPy alias was deprecated
        # and then removed in some NumPy releases; `bool` is the same dtype.
        X = vectorize_batch(input_texts, token_index,
                            max_encoder_seq_length, dtype=bool)
        Y = vectorize_batch(target_texts, token_index,
                            max_decoder_seq_length, dtype=bool)
        return X, Y

    return training_vectorizer

# Bind the token index and both sequence lengths once, so that the resulting
# training_vectorizer only needs the input and target texts
training_vectorizer = training_vectorizer_fn(
    token_index=token_idx,
    max_encoder_seq_length=max_seq_length,
    max_decoder_seq_length=max_target_seq_length,
)

In [None]:
def batched_gen(phrases, chunk_size, misspellings_count, noise, max_txt_len):
    """Walk through `phrases` in consecutive chunks of `chunk_size` and yield,
    for every chunk, the result of `create_misspellings` on that chunk.

    Callers unpack each yielded value as an (input phrases, target phrases)
    pair. The last chunk is shorter when `chunk_size` does not divide
    `len(phrases)`."""
    total = len(phrases)
    for start in range(0, total, chunk_size):
        chunk = phrases[start:start + chunk_size]
        yield create_misspellings(chunk, noise, misspellings_count,
                                  max_txt_len)

In [None]:
# Smoke-test the generator with chunk_size=2, misspellings_count=2, noise=0.07
tst = batched_gen(train_phrases, 2, 2, 0.07, max_seq_length)
inp, trgt = next(tst)
# Print every generated input next to its target
for i,t in zip(inp, trgt):
    print(i, '->',t)
    
# Pull a second chunk to check that the generator keeps producing
_ = next(tst)

In [None]:
def vectorized_gen(phrases, batch_size, misspellings_count, noise,
                   max_txt_len,training_vectorizer):
    """Yield vectorized (X, Y) batches built from `phrases`.

    Text batches come from `batched_gen`; the targets of each batch are
    wrapped with delimiters before both sides are handed to
    `training_vectorizer`."""
    text_batches = batched_gen(phrases, batch_size, misspellings_count,
                               noise, max_txt_len)
    for noisy_texts, clean_texts in text_batches:
        delimited_targets = wrap_with_delims(clean_texts)
        features, labels = training_vectorizer(noisy_texts, delimited_targets)
        yield features, labels

In [None]:
chunk_size = 2000  # <- best if it divides `num_samples`

# create a final generator holding all the context
def training_generator():
    """Endlessly yield vectorized (X, Y) training batches.

    Every pass walks `train_phrases` in chunks of `chunk_size`; once a pass
    is exhausted a new one is started."""
    while True:
        yield from vectorized_gen(train_phrases, chunk_size,
                                  misspellings_count, noise,
                                  max_seq_length,
                                  training_vectorizer)

# Number of generator batches that make up one pass over the training set.
# Integer ceiling division: a step count must be a whole number, and rounding
# up keeps the last (shorter) chunk when `chunk_size` is not a divisor.
steps_per_epoch = (len(train_phrases) + chunk_size - 1) // chunk_size

In [None]:
# For validation, just use a fixed set of examples, not a generator
# (`misspellings_count` replaces the literal 3, which had the same value)
val_wrong, val_right = create_misspellings(test_phrases[:1000], noise,
                                           misspellings_count, max_seq_length)
wrapped_val_phrases = wrap_with_delims(val_right)
# Builtin `bool` rather than `np.bool`: the NumPy alias was deprecated and then
# removed in some NumPy releases, while `bool` names the same dtype everywhere.
val_X = vectorize_batch(val_wrong, token_idx, max_seq_length, dtype=bool)
val_Y = vectorize_batch(wrapped_val_phrases, token_idx, max_target_seq_length, dtype=bool)

In [None]:
# Fresh model for the generator-based training run (rebinds `model`)
model = simple_lstm(max_target_seq_length, len(token_idx))
# Stop once val_loss has not improved by at least 0.005 for 3 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=.005,
                               patience=3, verbose=0, mode='auto')

# Create the endless training generator once and hand that same object to Keras
# (previously `gen` was created but a second generator was passed instead)
gen = training_generator()
model.fit_generator(gen, validation_data=(val_X, val_Y),
                    steps_per_epoch=steps_per_epoch,
                    verbose=2, max_queue_size=3, epochs=epochs,
                    callbacks=[early_stopping])

In [None]:
# Plot the history recorded by the fit_generator call above
plot_history(model.history)

In [None]:
def translator_fn(model, token_index, max_seq_len):
    """Create a `predict(phrase) -> str` function backed by `model`.

    Args:
        model: trained model exposing `predict`.
        token_index: mapping from token (character) to its integer index.
        max_seq_len: sequence length used when vectorizing the input phrase.

    Returns:
        A function that vectorizes a phrase, runs the model on it and decodes
        the most likely token at every output position. The first output
        position (the [START] slot) is dropped and the text is cut at the
        first "\n" ([END]); when no "\n" is predicted the whole remainder is
        returned.
    """
    inverse_token_index = {v: k for k, v in token_index.items()}
    def predict(in_phrase):
        x = vectorize_phrase(in_phrase, token_index, max_seq_len)
        # argmax over the token axis is what `predict_classes` computed for a
        # multi-class output; `predict_classes` no longer exists in newer Keras.
        pred_idxes = model.predict(x, verbose=0).argmax(axis=-1)[0]
        txt = ''.join(inverse_token_index[i] for i in pred_idxes)
        end_idx = txt.find("\n")
        if end_idx == -1:
            # No end delimiter predicted: keep everything instead of letting
            # the -1 slice silently drop the last character.
            end_idx = len(txt)
        return txt[1:end_idx]
    return predict

def evaluate_correct(texts, corrector):
    """Return the fraction of `texts` that `corrector` fails to leave unchanged.

    Args:
        texts: phrases that are already correct.
        corrector: function mapping a phrase to its corrected version.

    Returns:
        Error rate as a float in [0, 1]; 0.0 when `texts` is empty.
    """
    if len(texts) == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0
    errors = sum(1 for t in texts if t != corrector(t))
    return errors / len(texts)

def evaluate_misspelled(texts, corrector, noise=0.05):
    """Return the fraction of `texts` not recovered after noise is added.

    Each phrase is passed through `add_noise_to_string` and then through
    `corrector`; it counts as an error when the result differs from the
    original phrase.

    Args:
        texts: correct phrases.
        corrector: function mapping a phrase to its corrected version.
        noise: noise level handed to `add_noise_to_string` (default 0.05).

    Returns:
        Error rate as a float in [0, 1]; 0.0 when `texts` is empty.
    """
    if len(texts) == 0:
        # Nothing to evaluate: avoid dividing by zero.
        return 0.0
    errors = sum(1 for t in texts
                 if t != corrector(add_noise_to_string(t, noise)))
    return errors / len(texts)


In [None]:
# Build the phrase -> corrected-phrase function for the current `model`.
# NOTE(review): input phrases are vectorized here with `max_target_seq_length`,
# whereas the training and validation inputs were vectorized with
# `max_seq_length` — confirm that this difference is intended.
corrector = translator_fn(model, token_idx, max_target_seq_length)

In [None]:
# (prediction, original, unchanged?) for the first 20 training phrases;
# the corrector is run once per phrase instead of twice
[(fixed, p, fixed == p)
 for p, fixed in ((p, corrector(p)) for p in train_phrases[:20])]

In [None]:
# Error rates (fraction of phrases not reproduced exactly) on 1000-phrase slices.
# Clean training phrases: first 1000, then last 1000
print(evaluate_correct(train_phrases[:1000], corrector))
print(evaluate_correct(train_phrases[-1000:], corrector))
# Training phrases with noise added
print(evaluate_misspelled(train_phrases[:1000], corrector))
# Held-out test phrases: clean, then with noise added
print(evaluate_correct(test_phrases[:1000], corrector))
print(evaluate_misspelled(test_phrases[:1000], corrector))

In [None]:
# Same call as the plot_history cell right after fit_generator
plot_history(model.history)

0.846

0.788

0.859

In [80]:
# Spot-check the corrector on a few hand-written, misspelled phrases.
# NOTE(review): the commented-out line below refers to an `encoder_model` that
# is not defined in the cells shown here; it looks like a leftover.
#max_encoder_seq_length = encoder_model.get_layer('encoder_inputs').input_shape[-1]
phrases = ['fire', 'stp', 'comein', 'get ot', 'i cant go','im sorry', 
           'h is busi', 'hes drunk', 'ill be lat', 'hold mi beer', 'pus the buton', 
          'coll me on my phone', 'helo boys and girls']

[corrector(phrase) for phrase in phrases]

['fire',
 'ss?',
 'comen',
 'get o',
 'i cant  ',
 "i'm srry",
 'he is bui',
 'hess rruk',
 "i'll be tt",
 'hold my berr',
 'pust the butn',
 'coll me on my pone',
 'hellooyssannggrrl']

In [84]:
def save(fname):
    """Quick-n-dirty helper for saving the current global `model`.

    Writes two files: `<fname>.h5` with the Keras model and
    `<fname>_metadata.pickle` with the token index and sequence lengths
    needed to vectorize text for it.

    Args:
        fname: path prefix (without extension) of the files to write.
    """
    print("Saving model")
    model.save(fname + '.h5')

    # Store the index built from the data (`token_idx`); `token_index` is the
    # helper function that builds it, not the index itself.
    model_metadata = { 'token_index': token_idx,
                       'max_seq_length': max_seq_length,
                       # NOTE(review): the key says "encoder" but the value is
                       # the target length; the key is kept as-is so existing
                       # readers of the pickle keep working.
                       'max_encoder_seq_length': max_target_seq_length }

    with open(fname + '_metadata.pickle', 'wb') as f:
        pickle.dump(model_metadata, f)

In [85]:
# Writes spelling_with_generator.h5 and spelling_with_generator_metadata.pickle
save('spelling_with_generator')

Saving model
