In [1]:
# https://www.kaggle.com/c/text-normalization-challenge-english-language

In [1]:
import pickle as pkl
import numpy as np
from pathlib import Path
import csv
from sklearn.model_selection import train_test_split
import itertools
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tqdm import tqdm

from keras.models import Model
from keras.layers import Input, CuDNNLSTM, Dense

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
PATH = Path('DATA')
NUM_CLASSES = 100
MAX_LEN = 15
SEQ_LEN = MAX_LEN + 2
NUM_TOKENS = NUM_CLASSES + 2

## Read data

In [3]:
x, y = [], []
with open(PATH/'kaggle_data/en_train.csv', newline='', encoding='utf8') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for i, row in enumerate(spamreader):
        if i > 0:
            x.append(row[3])
            y.append(row[4])

print('Num examples:', len(x), len(y))
print('Num classes: ', len(set(y)))


#x_trn, x_val, y_trn, y_val = train_test_split(x, y, test_size=0.05, random_state=42)
#pkl.dump([x_trn, y_trn, x_val, y_val], open(PATH/'xt_yt_xv_yv.pkl', 'wb'))

Num examples: 9918441 9918441
Num classes:  471587


## Process data

#### Inspect data

In [50]:
print('\n############## Train data:')
for i in range(10, 15):
    print(x[i], '-', y[i])


############## Train data:
2006 - two thousand six
IUCN - i u c n
Red - Red
List - List
of - of


In [51]:
# Get longest words
print(max([len(x) for x in x]))
print(max([len(x) for x in y]))
# Clearly has some abnormally long sequences 

1057
3767


In [52]:
sorted_len_x = sorted(x, key=len)
sorted_len_y = sorted(y, key=len)

In [53]:
# The outputs contain links and very long numbers, the character based model should have around 130 max characters
print(sorted_len_x[-650])
print(sorted_len_y[-650])

http://cfpub.epa.gov/ncea/iris/index.cfm
nine trillion seven hundred eighty billion eight hundred four million seven hundred thirty seven thousand seven hundred forty six


#### Tokenize data

In [4]:
x_t = []
y_t = []

for it, (xi, yi) in tqdm(enumerate(zip(x, y))):
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    if len(xi) <= MAX_LEN and len(yi) <= MAX_LEN:
        yi = '\t' + yi + '\n'
        y_t.append(yi)
        x_t.append(xi)
            
tokenizer = Tokenizer(num_words=NUM_CLASSES, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
          lower=True, split='', char_level=True, oov_token='OOV')

tokenizer.fit_on_texts(x_t + y_t)
x_t = tokenizer.texts_to_sequences(x_t)
y_t = tokenizer.texts_to_sequences(y_t)
print(tokenizer.word_index['p'])

9918441it [00:08, 1163585.24it/s]


19


In [5]:
# Pre pad x and post pad y
x_t_p = pad_sequences(x_t, maxlen=MAX_LEN+2, dtype='int32', padding='pre', truncating='post', value=0)
y_t_p = pad_sequences(y_t, maxlen=MAX_LEN+2, dtype='int32', padding='post', truncating='post', value=0)

In [6]:
print(tokenizer.word_index['\t'])
ind=8
print(x_t_p[ind], x_t_p[ind].shape)
print(y_t_p[ind], y_t_p[ind].shape)

2
[ 0  0  0  0  0  0  5 15  5  8  6 12  5 15  4  5  4] (17,)
[ 2  5 15  5  8  6 12  5 15  4  5  4  3  0  0  0  0] (17,)


In [11]:
encoder_input_data = x_t_p
decoder_input_data = y_t_p
decoder_target_data = np.zeros_like(decoder_input_data)

for i, val in enumerate(decoder_input_data):
    decoder_target_data[i] = np.append(decoder_input_data[i][1:], np.array([0]))

# for i, val in enumerate(decoder_input_data):
#     decoder_target_data[i][1:] = decoder_input_data[i][:-1]

In [12]:
encoder_input_data = np.expand_dims(encoder_input_data, axis=-1)
decoder_input_data = np.expand_dims(decoder_input_data, axis=-1)
decoder_target_data = np.expand_dims(decoder_target_data, axis=-1)

In [13]:
print(decoder_input_data[0])
print(decoder_target_data[0])

[[ 2]
 [24]
 [10]
 [ 7]
 [13]
 [13]
 [ 5]
 [ 8]
 [ 6]
 [ 5]
 [ 7]
 [11]
 [ 7]
 [ 5]
 [ 3]
 [ 0]
 [ 0]]
[[24]
 [10]
 [ 7]
 [13]
 [13]
 [ 5]
 [ 8]
 [ 6]
 [ 5]
 [ 7]
 [11]
 [ 7]
 [ 5]
 [ 3]
 [ 0]
 [ 0]
 [ 0]]


In [13]:
np.savez(open(PATH/'pkl/enci_deci_dect.npz', 'wb'), encoder_input_data, decoder_input_data, decoder_target_data)

In [14]:
pkl.dump(tokenizer, open(PATH/'tokenizer.pkl', 'wb'))

In [129]:
# x_trn, x_val, y_trn, y_val = train_test_split(x_p_t, y_p_t, test_size=0.05, random_state=42)

In [135]:
# pkl.dump([x_trn, y_trn, x_val, y_val], open(PATH/'xt_yt_xv_yv.pkl', 'wb'), protocol=4)

## Load data

In [15]:
with open(PATH/'pkl/enci_deci_dect.npz', 'rb') as f:
    arrs  = np.load(f)
    encoder_input_data = arrs['arr_0']
    decoder_input_data = arrs['arr_1']
    decoder_target_data = arrs['arr_2']
    
with open(PATH/'tokenizer.pkl', 'rb') as f:
    tokenizer = pkl.load(f)

In [8]:
# with open(PATH/'pkl/xt_yt_xv_yv.pkl', 'rb') as f:
#     x_trn, y_trn, x_val, y_val = pkl.load(f)

## Model

In [15]:
h_size = 64
bs = 64

In [16]:
# Define an input sequence and process it.
encoder_inp = Input(shape=(SEQ_LEN, 1))
encoder = CuDNNLSTM(h_size, return_state=True)
encoder_out, state_h, state_c = encoder(encoder_inp)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inp = Input(shape=(SEQ_LEN, 1))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder_lstm = CuDNNLSTM(h_size, return_sequences=True, return_state=True)
decoder_out, _, _ = decoder_lstm(decoder_inp, initial_state=encoder_states)
decoder_dense = Dense(NUM_TOKENS, activation='softmax')
decoder_out = decoder_dense(decoder_out)

Instructions for updating:
Use the retry module or similar alternatives.


In [17]:
model = Model([encoder_inp, decoder_inp], decoder_out)

In [18]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=bs,
          epochs=1,
          validation_split=0.2,
         verbose=1)

Train on 7693076 samples, validate on 1923270 samples
Epoch 1/1


 129984/7693076 [..............................] - ETA: 89:00:04 - loss: 4.637 - ETA: 12:58:19 - loss: 4.488 - ETA: 6:37:06 - loss: 4.266 - ETA: 4:30:16 - loss: 3.94 - ETA: 3:34:06 - loss: 3.60 - ETA: 2:58:18 - loss: 3.27 - ETA: 2:33:28 - loss: 2.98 - ETA: 2:12:31 - loss: 2.72 - ETA: 1:59:11 - loss: 2.53 - ETA: 1:47:00 - loss: 2.36 - ETA: 1:38:45 - loss: 2.24 - ETA: 1:30:43 - loss: 2.12 - ETA: 1:24:02 - loss: 2.02 - ETA: 1:18:30 - loss: 1.94 - ETA: 1:13:47 - loss: 1.86 - ETA: 1:10:22 - loss: 1.81 - ETA: 1:06:46 - loss: 1.75 - ETA: 1:03:40 - loss: 1.70 - ETA: 1:00:53 - loss: 1.65 - ETA: 58:27 - loss: 1.6171 - ETA: 56:34 - loss: 1.58 - ETA: 54:50 - loss: 1.55 - ETA: 53:15 - loss: 1.52 - ETA: 51:32 - loss: 1.49 - ETA: 49:58 - loss: 1.46 - ETA: 48:30 - loss: 1.44 - ETA: 47:12 - loss: 1.41 - ETA: 45:59 - loss: 1.39 - ETA: 44:52 - loss: 1.37 - ETA: 43:50 - loss: 1.35 - ETA: 42:49 - loss: 1.33 - ETA: 41:54 - loss: 1.31 - ETA: 41:03 - loss: 1.30 - ETA: 40:24 - loss: 1.28 - ETA: 39:40 - loss: 1

 264448/7693076 [>.............................] - ETA: 18:42 - loss: 0.66 - ETA: 18:41 - loss: 0.66 - ETA: 18:41 - loss: 0.65 - ETA: 18:40 - loss: 0.65 - ETA: 18:40 - loss: 0.65 - ETA: 18:39 - loss: 0.65 - ETA: 18:39 - loss: 0.65 - ETA: 18:38 - loss: 0.65 - ETA: 18:37 - loss: 0.65 - ETA: 18:36 - loss: 0.65 - ETA: 18:36 - loss: 0.65 - ETA: 18:35 - loss: 0.65 - ETA: 18:34 - loss: 0.65 - ETA: 18:34 - loss: 0.65 - ETA: 18:33 - loss: 0.65 - ETA: 18:33 - loss: 0.65 - ETA: 18:32 - loss: 0.65 - ETA: 18:31 - loss: 0.65 - ETA: 18:30 - loss: 0.64 - ETA: 18:30 - loss: 0.64 - ETA: 18:29 - loss: 0.64 - ETA: 18:29 - loss: 0.64 - ETA: 18:28 - loss: 0.64 - ETA: 18:27 - loss: 0.64 - ETA: 18:27 - loss: 0.64 - ETA: 18:26 - loss: 0.64 - ETA: 18:26 - loss: 0.64 - ETA: 18:25 - loss: 0.64 - ETA: 18:24 - loss: 0.64 - ETA: 18:24 - loss: 0.64 - ETA: 18:23 - loss: 0.64 - ETA: 18:23 - loss: 0.64 - ETA: 18:22 - loss: 0.64 - ETA: 18:22 - loss: 0.64 - ETA: 18:21 - loss: 0.64 - ETA: 18:20 - loss: 0.64 - ETA: 18:20 - 

 400128/7693076 [>.............................] - ETA: 16:58 - loss: 0.54 - ETA: 16:58 - loss: 0.54 - ETA: 16:58 - loss: 0.54 - ETA: 16:58 - loss: 0.54 - ETA: 16:57 - loss: 0.54 - ETA: 16:57 - loss: 0.54 - ETA: 16:57 - loss: 0.53 - ETA: 16:57 - loss: 0.53 - ETA: 16:56 - loss: 0.53 - ETA: 16:56 - loss: 0.53 - ETA: 16:56 - loss: 0.53 - ETA: 16:56 - loss: 0.53 - ETA: 16:55 - loss: 0.53 - ETA: 16:55 - loss: 0.53 - ETA: 16:55 - loss: 0.53 - ETA: 16:55 - loss: 0.53 - ETA: 16:55 - loss: 0.53 - ETA: 16:54 - loss: 0.53 - ETA: 16:54 - loss: 0.53 - ETA: 16:54 - loss: 0.53 - ETA: 16:54 - loss: 0.53 - ETA: 16:54 - loss: 0.53 - ETA: 16:54 - loss: 0.53 - ETA: 16:53 - loss: 0.53 - ETA: 16:53 - loss: 0.53 - ETA: 16:53 - loss: 0.53 - ETA: 16:53 - loss: 0.53 - ETA: 16:53 - loss: 0.53 - ETA: 16:53 - loss: 0.53 - ETA: 16:52 - loss: 0.53 - ETA: 16:52 - loss: 0.53 - ETA: 16:52 - loss: 0.53 - ETA: 16:52 - loss: 0.53 - ETA: 16:52 - loss: 0.53 - ETA: 16:52 - loss: 0.53 - ETA: 16:51 - loss: 0.53 - ETA: 16:51 - 

 534336/7693076 [=>............................] - ETA: 16:09 - loss: 0.47 - ETA: 16:09 - loss: 0.47 - ETA: 16:09 - loss: 0.47 - ETA: 16:09 - loss: 0.47 - ETA: 16:08 - loss: 0.47 - ETA: 16:08 - loss: 0.47 - ETA: 16:08 - loss: 0.47 - ETA: 16:08 - loss: 0.47 - ETA: 16:08 - loss: 0.47 - ETA: 16:08 - loss: 0.47 - ETA: 16:08 - loss: 0.47 - ETA: 16:08 - loss: 0.47 - ETA: 16:07 - loss: 0.47 - ETA: 16:07 - loss: 0.47 - ETA: 16:07 - loss: 0.47 - ETA: 16:07 - loss: 0.47 - ETA: 16:07 - loss: 0.47 - ETA: 16:07 - loss: 0.47 - ETA: 16:07 - loss: 0.47 - ETA: 16:06 - loss: 0.47 - ETA: 16:06 - loss: 0.47 - ETA: 16:06 - loss: 0.47 - ETA: 16:06 - loss: 0.47 - ETA: 16:06 - loss: 0.47 - ETA: 16:06 - loss: 0.47 - ETA: 16:06 - loss: 0.47 - ETA: 16:06 - loss: 0.47 - ETA: 16:05 - loss: 0.47 - ETA: 16:05 - loss: 0.47 - ETA: 16:05 - loss: 0.47 - ETA: 16:05 - loss: 0.47 - ETA: 16:05 - loss: 0.47 - ETA: 16:05 - loss: 0.47 - ETA: 16:05 - loss: 0.47 - ETA: 16:05 - loss: 0.47 - ETA: 16:04 - loss: 0.47 - ETA: 16:04 - 

 669184/7693076 [=>............................] - ETA: 15:38 - loss: 0.43 - ETA: 15:38 - loss: 0.43 - ETA: 15:38 - loss: 0.43 - ETA: 15:38 - loss: 0.43 - ETA: 15:38 - loss: 0.43 - ETA: 15:38 - loss: 0.43 - ETA: 15:38 - loss: 0.43 - ETA: 15:38 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:37 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:36 - loss: 0.43 - ETA: 15:35 - loss: 0.43 - ETA: 15:35 - loss: 0.43 - ETA: 15:35 - loss: 0.43 - ETA: 15:35 - loss: 0.43 - ETA: 15:35 - loss: 0.43 - ETA: 15:35 - loss: 0.43 - ETA: 15:35 - 

 804864/7693076 [==>...........................] - ETA: 15:12 - loss: 0.40 - ETA: 15:12 - loss: 0.40 - ETA: 15:12 - loss: 0.40 - ETA: 15:12 - loss: 0.40 - ETA: 15:12 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:11 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.40 - ETA: 15:10 - loss: 0.39 - ETA: 15:10 - loss: 0.39 - ETA: 15:09 - loss: 0.39 - ETA: 15:09 - loss: 0.39 - ETA: 15:09 - loss: 0.39 - ETA: 15:09 - loss: 0.39 - ETA: 15:09 - loss: 0.39 - ETA: 15:09 - 

 940992/7693076 [==>...........................] - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:48 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:47 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:46 - loss: 0.37 - ETA: 14:45 - 

1076608/7693076 [===>..........................] - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:26 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:25 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - loss: 0.35 - ETA: 14:24 - 

1212224/7693076 [===>..........................] - ETA: 14:06 - loss: 0.33 - ETA: 14:06 - loss: 0.33 - ETA: 14:06 - loss: 0.33 - ETA: 14:06 - loss: 0.33 - ETA: 14:06 - loss: 0.33 - ETA: 14:06 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:05 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:04 - loss: 0.33 - ETA: 14:03 - loss: 0.33 - ETA: 14:03 - 

1347712/7693076 [====>.........................] - ETA: 13:47 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:46 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:45 - loss: 0.31 - ETA: 13:44 - loss: 0.31 - ETA: 13:44 - loss: 0.31 - ETA: 13:44 - loss: 0.31 - ETA: 13:44 - loss: 0.31 - ETA: 13:44 - 

1482752/7693076 [====>.........................] - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:27 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:26 - loss: 0.30 - ETA: 13:25 - loss: 0.30 - ETA: 13:25 - loss: 0.30 - ETA: 13:25 - loss: 0.30 - ETA: 13:25 - loss: 0.30 - ETA: 13:25 - 

1618624/7693076 [=====>........................] - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:09 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:08 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - loss: 0.29 - ETA: 13:07 - 

1751232/7693076 [=====>........................] - ETA: 12:51 - loss: 0.28 - ETA: 12:51 - loss: 0.28 - ETA: 12:51 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:50 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - loss: 0.28 - ETA: 12:49 - 



























































































<keras.callbacks.History at 0x2e143ae0b38>

In [9]:
model.save(PATH/'models/1epoch.h5',)

  '. They will not be included '


In [22]:
model.save_weights(PATH/'models/1epochw.h5')

In [14]:
model.load_weights(PATH/'models/1epochw.h5')

In [19]:
# For inference
encoder_model = Model(encoder_inp, encoder_states)

decoder_s_inp_h = Input(shape=(h_size,))
decoder_s_inp_c = Input(shape=(h_size,))
decoder_s_inps = [decoder_s_inp_h, decoder_s_inp_c]
decoder_out, s_h, s_c = decoder_lstm(decoder_inp, initial_state=decoder_s_inps)
decoder_s = [s_h, s_c]
decoder_out = decoder_dense(decoder_out)
decoder_model = Model([decoder_inp] + decoder_s_inps, [decoder_out] + decoder_s)

In [37]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, SEQ_LEN, 1))
    # Populate the first character of target sequence with the start character.
    target_seq[0] = tokenizer.word_index['\t']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
#         import pdb;pdb.set_trace()
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        if sampled_token_index != 0:
            sampled_char = reverse_word_map[sampled_token_index] #
        else:
            sampled_char = '0'
        decoded_sentence += sampled_char
        
        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > SEQ_LEN):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq =  np.zeros((1, SEQ_LEN, 1))
        target_seq[0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [38]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', x[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: Brillantaisia
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: is
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: a
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: genus
Decoded sentence: dtceeeeeeeeeeeeeee
-
Input sentence: of
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: plant
Decoded sentence: dtodeeeeeeeeeeeeee
-
Input sentence: in
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: family
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: Acanthaceae
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: .
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: 2006
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: IUCN
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: Red
Decoded sentence: OOVOOVOOVaonaeeeee
-
Input sentence: List
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: of
Decoded sentence: pfzeeeeeeeeeeeeeee
-
Input sentence: Threatened
Decoded sentence: OOVOOVOOVOOVOOVOOV
-
Input sentence: Species
Dec

In [27]:
tokenizer.word_index['t']

5

In [28]:
decoded_sentence

NameError: name 'decoded_sentence' is not defined

# Notes

- Technically shouldn't lower case tokens
- Use an embedding layer perhaps
- decoder_target_data is wrong, shoudln't have start token