In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the ciphertext file and split it into one string per line; these lines
# are the model inputs. Note: splitting on '\n' keeps a trailing empty string
# when the file ends with a newline, which then shows up as an empty sample.
with open('../input/cipher.txt','r') as f:
    data = f.read()
    source = data.split('\n')
# Read the plaintext file the same way; these lines are the model targets.
# source[i] and target[i] come from the same line number of the two files.
with open('../input/plaintext.txt','r') as f:
    data = f.read()
    target = data.split('\n')

In [18]:
# Ciphertext and plaintext lines correspond one-to-one: show the first three
# of each, ciphertext list first, plaintext list second.
print(source[:3], target[:3], sep='\n')

['YMJ QNRJ NX MJW QJFXY QNPJI KWZNY , GZY YMJ GFSFSF NX RD QJFXY QNPJI .', 'MJ XFB F TQI DJQQTB YWZHP .', 'NSINF NX WFNSD IZWNSL OZSJ , FSI NY NX XTRJYNRJX BFWR NS STAJRGJW .']
['THE LIME IS HER LEAST LIKED FRUIT , BUT THE BANANA IS MY LEAST LIKED .', 'HE SAW A OLD YELLOW TRUCK .', 'INDIA IS RAINY DURING JUNE , AND IT IS SOMETIMES WARM IN NOVEMBER .']


In [None]:
def tokenize(x):
    """Encode texts as sequences of character ids.

    A fresh character-level Keras ``Tokenizer`` is fitted on ``x`` and then
    used to convert the same texts to integer sequences.

    Args:
        x: List of strings to fit on and encode.

    Returns:
        A tuple ``(text_tokenized, text_tokenizer)``: the list of integer
        sequences (one per input text) and the fitted tokenizer.
    """
    text_tokenizer = Tokenizer(char_level=True)
    # The vocabulary must be fitted before texts_to_sequences can be used.
    text_tokenizer.fit_on_texts(x)
    text_tokenized = text_tokenizer.texts_to_sequences(x)
    return text_tokenized, text_tokenizer

1.
```
keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)
```
- word_index, 0 is a reserved index that won't be assigned to any word.

2.
```
tokenizer.fit_on_texts() 
```
-  Required before using `texts_to_sequences` or `texts_to_matrix`.

3.
```
tokenizer.texts_to_sequences()
```
- Returns a list of integer sequences, one per input text.

4.
```
tokenizer.word_index
```
- An attribute (not a method): a dict mapping each token to its index, e.g. {' ': 1, 'e': 2, 'o': 3, 't': 4, 'i': 5}. Index 0 is reserved.

In [19]:
# Small hand-written corpus used only to demonstrate the helper functions.
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']

# Encode the demo sentences character by character and keep the fitted
# tokenizer so its vocabulary can be inspected below.
text_tokenized, text_tokenizer = tokenize(text_sentences)

In [20]:
# Character -> integer id mapping fitted on the demo sentences.
print(text_tokenizer.word_index)

{' ': 1, 'e': 2, 'o': 3, 't': 4, 'i': 5, 's': 6, 'h': 7, 'r': 8, 'y': 9, 'u': 10, 'c': 11, 'n': 12, 'a': 13, 'p': 14, '.': 15, 'q': 16, 'k': 17, 'b': 18, 'w': 19, 'f': 20, 'x': 21, 'j': 22, 'm': 23, 'v': 24, 'l': 25, 'z': 26, 'd': 27, 'g': 28, ',': 29}


In [21]:
# Print every demo sentence next to its character-id encoding.
for i, (x, tk_x) in enumerate(zip(text_sentences, text_tokenized)):
    print(
        'sample{}'.format(i + 1),
        'source: {}'.format(x),
        'tk: {}'.format(tk_x),
        sep='\n',
    )

sample1
source: The quick brown fox jumps over the lazy dog .
tk: [4, 7, 2, 1, 16, 10, 5, 11, 17, 1, 18, 8, 3, 19, 12, 1, 20, 3, 21, 1, 22, 10, 23, 14, 6, 1, 3, 24, 2, 8, 1, 4, 7, 2, 1, 25, 13, 26, 9, 1, 27, 3, 28, 1, 15]
sample2
source: By Jove , my quick study of lexicography won a prize .
tk: [18, 9, 1, 22, 3, 24, 2, 1, 29, 1, 23, 9, 1, 16, 10, 5, 11, 17, 1, 6, 4, 10, 27, 9, 1, 3, 20, 1, 25, 2, 21, 5, 11, 3, 28, 8, 13, 14, 7, 9, 1, 19, 3, 12, 1, 13, 1, 14, 8, 5, 26, 2, 1, 15]
sample3
source: This is a short sentence .
tk: [4, 7, 5, 6, 1, 5, 6, 1, 13, 1, 6, 7, 3, 8, 4, 1, 6, 2, 12, 4, 2, 12, 11, 2, 1, 15]


In [22]:
def pad(x, length=None):
    """Pad every sequence in ``x`` with trailing zeros to a common length.

    Args:
        x: Collection of integer-id sequences (e.g. the output of
            ``tokenize``).
        length: Target length. When ``None``, the length of the longest
            sequence in ``x`` is used.

    Returns:
        The array produced by ``pad_sequences`` with ``padding='post'``,
        i.e. zeros are appended after each sequence.
    """
    # Identity check: `== None` can be overridden by the operand's __eq__
    # (NumPy values compare element-wise), whereas `is None` cannot.
    if length is None:
        length = max(len(seq) for seq in x)
    return pad_sequences(x, maxlen=length, padding='post')

```
keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0)
```
- padding: String, 'pre' or 'post': pad either before or after each sequence.
- truncating: String, 'pre' or 'post': remove values from sequences larger than  maxlen, either at the beginning or at the end of the sequences.
- value: Float or String, padding value.

In [23]:
# Pad the demo encodings to the length of the longest sentence and display
# the resulting array (shorter sentences end in zeros).
test_pad = pad(text_tokenized)
test_pad

array([[ 4,  7,  2,  1, 16, 10,  5, 11, 17,  1, 18,  8,  3, 19, 12,  1,
        20,  3, 21,  1, 22, 10, 23, 14,  6,  1,  3, 24,  2,  8,  1,  4,
         7,  2,  1, 25, 13, 26,  9,  1, 27,  3, 28,  1, 15,  0,  0,  0,
         0,  0,  0,  0,  0,  0],
       [18,  9,  1, 22,  3, 24,  2,  1, 29,  1, 23,  9,  1, 16, 10,  5,
        11, 17,  1,  6,  4, 10, 27,  9,  1,  3, 20,  1, 25,  2, 21,  5,
        11,  3, 28,  8, 13, 14,  7,  9,  1, 19,  3, 12,  1, 13,  1, 14,
         8,  5, 26,  2,  1, 15],
       [ 4,  7,  5,  6,  1,  5,  6,  1, 13,  1,  6,  7,  3,  8,  4,  1,
         6,  2, 12,  4,  2, 12, 11,  2,  1, 15,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0]], dtype=int32)

In [24]:
def preprocess(x, y):
    """Tokenize and zero-pad the input texts ``x`` and the label texts ``y``.

    ``x`` and ``y`` are tokenized independently (each gets its own
    tokenizer) and padded independently (each to its own longest sequence).

    Args:
        x: List of input strings.
        y: List of label strings.

    Returns:
        A tuple ``(preprocess_x, preprocess_y, x_tk, y_tk)``:
        the padded input ids with shape ``(samples, steps)``, the padded
        label ids with an extra trailing axis, ``(samples, steps, 1)``,
        and the two fitted tokenizers.
    """
    # Characters -> integer ids.
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    # Append zeros so every sequence has the same length.
    preprocess_x = pad(x_ids)
    padded_y = pad(y_ids)

    # Keras's sparse_categorical_crossentropy function requires the labels to
    # be in 3 dimensions: each sample is its own group, the group has one row
    # per character, and every row holds only that character's integer id.
    preprocess_y = padded_y.reshape(padded_y.shape + (1,))
    return preprocess_x, preprocess_y, x_tk, y_tk

In [25]:
# Encode and pad the full dataset: ciphertext as inputs, plaintext as labels.
pre_source, pre_target, tk_source, tk_target = preprocess(source, target)

In [26]:
# Labels are 3-D: (samples, timesteps, 1).
pre_target.shape

(10001, 101, 1)

In [32]:
# Display the label array: one single-element row per character position.
pre_target

array([[[ 5],
        [14],
        [ 3],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[14],
        [ 3],
        [ 1],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[ 2],
        [ 7],
        [11],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       ...,

       [[ 5],
        [14],
        [ 3],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[ 5],
        [14],
        [ 3],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[ 0],
        [ 0],
        [ 0],
        ...,
        [ 0],
        [ 0],
        [ 0]]], dtype=int32)

In [30]:
# View the labels as a 2-D (samples, timesteps) table by dropping the trailing
# singleton axis. The shape is taken from the array itself rather than being
# hard-coded, so the cell keeps working when the dataset size changes.
print(pre_target.reshape(pre_target.shape[:2]))

[[ 5 14  3 ...  0  0  0]
 [14  3  1 ...  0  0  0]
 [ 2  7 11 ...  0  0  0]
 ...
 [ 5 14  3 ...  0  0  0]
 [ 5 14  3 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]


In [33]:
from keras.layers import GRU, Input, Dense, TimeDistributed
from keras.models import Model
from keras.layers import Activation
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [34]:
def simple_model(input_shape, output_sequence_length, source_vocab_size, target_vocab_size):
    """Build and compile a single-layer GRU sequence-labelling model.

    The network is: Input -> GRU(64, return_sequences=True) ->
    TimeDistributed(Dense(target_vocab_size)) -> softmax. It is compiled
    with sparse categorical cross-entropy, Adam (learning rate 1e-3) and
    the accuracy metric.

    Args:
        input_shape: Shape of the full input array including the sample
            axis, e.g. ``(10001, 101, 1)``. Only ``input_shape[1:]`` is used,
            e.g. ``(101, 1)``: a single column in which each row is one
            character, fed as the input of one timestep. The 2-D
            ``(samples, steps)`` array cannot be fed directly in that form.
        output_sequence_length: Length of the output sequences, e.g. 101.
            Not used by this function.
        source_vocab_size: Size of the input vocabulary. Not used by this
            function.
        target_vocab_size: Number of output classes per timestep.

    Returns:
        The compiled Keras ``Model``.
    """
    learning_rate = 1e-3

    # Drop the sample axis: the model sees one (timesteps, features) sequence.
    inputs = Input(input_shape[1:])
    # return_sequences=True yields one hidden state per timestep.
    hidden_states = GRU(64, return_sequences=True)(inputs)
    # The same Dense layer is applied at every timestep.
    logits = TimeDistributed(Dense(target_vocab_size))(hidden_states)
    probabilities = Activation('softmax')(logits)

    model = Model(inputs, probabilities)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model

In [60]:
# Inputs are still 2-D here: (samples, timesteps).
pre_source.shape

(10001, 101)

In [36]:
# Labels already carry the trailing singleton axis: (samples, timesteps, 1).
pre_target.shape

(10001, 101, 1)

In [37]:
# Number of timesteps in the labels, indexed from the front.
pre_target.shape[1]

101

In [38]:
# Same axis indexed from the back; for a 3-D array shape[-2] is shape[1].
pre_target.shape[-2]

101

In [59]:
# Reshape the input so that it works with a basic RNN.
# Step 1: pad the input sentences to the same length as the output sentences
#         (not actually necessary here).
# Step 2: add a trailing singleton axis, the same operation that was applied
#         to preprocess_y earlier.
tmp_x = pad(pre_source, length=pre_target.shape[1]).reshape((-1, pre_target.shape[-2], 1))
print(pre_source.shape,' -> ',tmp_x.shape)
print(tmp_x.shape[1:])

(10001, 101)  ->  (10001, 101, 1)
(101, 1)


In [41]:
# Display the reshaped input: one single-element row per character position.
tmp_x

array([[[ 5],
        [14],
        [ 3],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[14],
        [ 3],
        [ 1],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[ 2],
        [ 7],
        [11],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       ...,

       [[ 5],
        [14],
        [ 3],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[ 5],
        [14],
        [ 3],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[ 0],
        [ 0],
        [ 0],
        ...,
        [ 0],
        [ 0],
        [ 0]]], dtype=int32)

In [54]:
tmp_x[:1].shape # tmp_x[:1, :, :] selects the first element, i.e. the first sample/sentence, keeping the sample axis

(1, 101, 1)

In [61]:
# Build and compile the neural network (training happens in the following fit call).
# The vocabulary sizes add 1 to len(word_index) because index 0 is reserved
# and is used here as the padding value.
simple_rnn_model = simple_model(
    tmp_x.shape, # (10001, 101, 1)
    pre_target.shape[1], # 101 of (10001, 101, 1)
    len(tk_source.word_index)+1, # 31+1 = 32
    len(tk_target.word_index)+1)

In [62]:
# Train for 5 epochs in mini-batches of 32, holding out 20% of the samples for validation.
simple_rnn_model.fit(tmp_x, pre_target, batch_size=32, epochs=5, validation_split=0.2)

Train on 8000 samples, validate on 2001 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2f937fb128>

In [63]:
def logits_to_text(logits, tokenizer):
    """Turn per-timestep class scores into a space-separated token string.

    Args:
        logits: 2-D array of scores with shape (timesteps, classes).
        tokenizer: Object exposing a ``word_index`` dict that maps each
            token to its integer id.

    Returns:
        The highest-scoring token of every timestep joined by single
        spaces; class 0 is rendered as ``'<PAD>'``.
    """
    # Invert the token -> id mapping, then register the padding id last so
    # that id 0 always decodes to '<PAD>'.
    vocabulary = dict((index, token) for token, index in tokenizer.word_index.items())
    vocabulary[0] = '<PAD>'

    best_classes = np.argmax(logits, axis=1)
    decoded = []
    for class_index in best_classes:
        decoded.append(vocabulary[class_index])
    return ' '.join(decoded)

print('`logits_to_text` function loaded.')

# Decode the model's prediction for the first training sentence.
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], tk_target))
# tmp_x[:1] = tmp_x[:1, :, :] is a batch holding only the first sample;
# [0] then picks that sample's per-timestep scores out of the prediction.

`logits_to_text` function loaded.
t h e   l i m e   i s   h e r   l e a s t   l i k e d   f r u i t   ,   b u t   t h e   g a n a n a   i s   m y   l e a s t   l i k e d   . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
