---
题目：Problem 3（keras解法）
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

### 1.下载text8

In [1]:
import os
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

# Base URL that maybe_download() prepends to the requested file name.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch ``filename`` from ``url`` unless it already exists, then check its size.

  Args:
    filename: Local path of the file; also appended to ``url`` when downloading.
    expected_bytes: Size in bytes the file on disk must have.

  Returns:
    The path of the verified file.

  Raises:
    Exception: If the size on disk differs from ``expected_bytes``.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    filename, _ = urlretrieve(url + filename, filename)

  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # Print the size that was actually found before failing.
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')

  print('Found and verified %s' % filename)
  return filename

# Download text8.zip if it is not already on disk and require it to be 31344016 bytes.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


### 2.构建数据集

In [2]:
import tensorflow as tf

def read_data(filename):
  """Return the contents of the first member of the zip archive ``filename``.

  The raw bytes are passed through ``tf.compat.as_str`` before being returned.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_bytes = archive.read(first_member)
  return tf.compat.as_str(raw_bytes)
  
# Read the archive's first member into one string and report how many characters it has.
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [10]:
# The first `valid_size` characters form the validation text; the following
# `train_size` characters form the training text.
valid_size = 10000
train_size = 2000000
valid_text, train_text = text[:valid_size], text[valid_size:valid_size+train_size]
# Record the length actually obtained by the slice.
train_size = len(train_text)
print(train_size, train_text[:200])
print(valid_size, valid_text[:100])

(2000000, ' collectively and that goods be distributed by need not labor an early anarchist communist was joseph d jacque the first person to describe himself as libertarian unlike proudhon he argued that it is ')
(10000, ' anarchism originated as a term of abuse first used against early working class radicals including t')


In [11]:
# Convert characters to integer ids
import string

# Id 0 stands for ' '; ids 1..26 stand for 'a'..'z'.
vocabulary_size = 1 + len(string.ascii_lowercase)
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  """Map a character to its id: 0 for a space, 1..26 for 'a'..'z'.

  Any other character is reported on stdout and mapped to 0 as well.
  """
  if char == ' ':
    return 0
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  print('Unexpected character: %s' % char)
  return 0

def id2char(dictid):
  """Map an id back to its character; every id that is not positive becomes ' '."""
  return chr(dictid + first_letter - 1) if dictid > 0 else ' '

# Sanity check: encode the two boundary letters and the space, then decode their ids again.
print(char2id('a'), char2id('z'), char2id(' '))
print(id2char(1), id2char(26), id2char(0))

(1, 26, 0)
('a', 'z', ' ')


In [12]:
def mirror(text):
    """Reverse the characters of every space-separated word in ``text``.

    Word order and the single-space separators (including leading, trailing
    and repeated ones) stay where they are.
    """
    return " ".join(token[::-1] for token in text.split(" "))

# Demonstrate mirror() on inputs with a leading and with a trailing space.
print(mirror(" collectively and tha"))
print(mirror("rn border with chile "))

 ylevitcelloc dna aht
nr redrob htiw elihc 


In [13]:
# Number of characters per sample; read by build_dataset, vectorize_dataset,
# the build_model call and test_output.
seq_len = 20

def build_dataset(text):
    """Split ``text`` into consecutive ``seq_len``-character samples with mirrored targets.

    Args:
        text: Source string; its characters are encoded with ``char2id``.

    Returns:
        A ``(dataset, labels)`` pair of equally long lists. ``dataset[k]`` holds
        the character ids of the k-th chunk of ``text`` and ``labels[k]`` the
        ids of ``mirror`` applied to that chunk. A trailing remainder shorter
        than ``seq_len`` is dropped.
    """
    dataset = []
    labels = []
    # Floor division keeps the count an int on Python 3 too, where ``/``
    # yields a float that ``range`` rejects.
    num_chunks = len(text) // seq_len
    for i in range(num_chunks):
        # Advance by a whole chunk. Slicing at ``i`` itself would give windows
        # that overlap by seq_len - 1 characters and would only ever reach the
        # first num_chunks + seq_len - 1 characters of ``text``.
        start = i * seq_len
        line = text[start:start + seq_len]
        mirror_line = mirror(line)
        dataset.append([char2id(ch) for ch in line])
        labels.append([char2id(ch) for ch in mirror_line])
    return dataset, labels

# Encode both splits as lists of id sequences with their mirrored targets.
train_set, train_labels = build_dataset(train_text)
valid_set, valid_labels = build_dataset(valid_text)

In [14]:
def show_string(data):
    """Decode an iterable of character ids into a string using ``id2char``."""
    return "".join(map(id2char, data))

# Show the first training sample and its target: first as id lists, then decoded to text.
print(train_set[0], train_labels[0])
print(show_string(train_set[0]), show_string(train_labels[0]))

([0, 3, 15, 12, 12, 5, 3, 20, 9, 22, 5, 12, 25, 0, 1, 14, 4, 0, 20, 8], [0, 25, 12, 5, 22, 9, 20, 3, 5, 12, 12, 15, 3, 0, 4, 14, 1, 0, 8, 20])
(' collectively and th', ' ylevitcelloc dna ht')


In [15]:
import numpy as np

def vectorize(word, seq_len, vec_size):
    """One-hot encode a sequence of ids into a ``(seq_len, vec_size)`` integer array.

    Row ``k`` gets a 1 in column ``word[k]``; rows beyond ``len(word)`` stay
    all zero.
    """
    onehot = np.zeros((seq_len, vec_size), dtype=int)
    ids = list(word)
    if ids:
        # Set one cell per position in a single indexed assignment.
        onehot[np.arange(len(ids)), ids] = 1
    return onehot

def vectorize_dataset(dataset, labels):
    """One-hot encode every sample and its label into two int8 arrays.

    Both returned arrays have shape ``(len(dataset), seq_len, vocabulary_size)``;
    the first holds the encoded samples, the second the encoded labels.
    """
    shape = (len(dataset), seq_len, vocabulary_size)
    inputs = np.zeros(shape, dtype=np.int8)
    targets = np.zeros(shape, dtype=np.int8)

    for idx, sample in enumerate(dataset):
        inputs[idx] = vectorize(sample, seq_len, vocabulary_size)
        targets[idx] = vectorize(labels[idx], seq_len, vocabulary_size)
    return inputs, targets

# Turn the id sequences of both splits into one-hot int8 arrays.
train_x, train_y = vectorize_dataset(train_set, train_labels)
valid_x, valid_y = vectorize_dataset(valid_set, valid_labels)

In [16]:
# Report the shape of each encoded array.
for label, array in (("train_x:", train_x), ("train_y:", train_y),
                     ("valid_x:", valid_x), ("valid_y:", valid_y)):
    print(label, array.shape)

('train_x:', (100000, 20, 27))
('train_y:', (100000, 20, 27))
('valid_x:', (500, 20, 27))
('valid_y:', (500, 20, 27))


### 3.构建模型

In [17]:
from keras.layers.recurrent import GRU
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, RepeatVector

def build_model(input_size, seq_len, hidden_size):
    """Assemble and compile the GRU encoder/decoder network.

    Layers, in order:
      1. GRU over inputs of shape ``(None, input_size)`` that returns only
         its final output (``return_sequences=False``).
      2. Dense layer of ``hidden_size`` units with ReLU activation.
      3. RepeatVector that repeats that vector ``seq_len`` times.
      4. GRU that returns an output for every step.
      5. Time-distributed Dense layer with ``input_size`` linear units.

    The model is compiled with mean-squared-error loss and the Adam optimizer.
    """
    # Layers are instantiated in the same order in which they are stacked.
    encoder = GRU(units=hidden_size, input_shape=(None, input_size), return_sequences=False)
    bridge = Dense(hidden_size, activation="relu")
    repeat = RepeatVector(seq_len)
    decoder = GRU(units=hidden_size, return_sequences=True)
    readout = TimeDistributed(Dense(units=input_size, activation="linear"))

    model = Sequential([encoder, bridge, repeat, decoder, readout])
    model.compile(optimizer='adam', loss="mse")
    return model

# One-hot width vocabulary_size, seq_len output steps, 128 hidden units.
model = build_model(vocabulary_size, seq_len, 128)

Using TensorFlow backend.


### 4.训练与评测

In [19]:
# Fit on the training arrays for 128 epochs with batches of 128 samples,
# passing the validation arrays as validation data.
model.fit(train_x, train_y,
          batch_size=128, 
          epochs=128,
          verbose=1,
          validation_data=(valid_x, valid_y))

Train on 100000 samples, validate on 500 samples
Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128
Epoch 59/128
Epoch 60/128
Epoch 61/128
Epoch 62/128
Epoch 63/128
Epoch 64/128
Epoch 65/128
Epoch 66/128
Epoch 67/128
Epoch 68/128
Epoch 69/128
Epoch 70/128
Epoch 71/128
Epoch 72/128
Epoch 73/128
Epoch 74/12

<keras.callbacks.History at 0x7f8a91854b50>

In [20]:
def test_output(test_str):
    """Run ``model`` on ``test_str`` and print the decoded prediction.

    The string is converted to ids, one-hot encoded as a batch of one int8
    sample, and the highest-scoring id at every step is turned back into a
    character.
    """
    char_ids = [char2id(ch) for ch in test_str]
    onehot = vectorize(char_ids, seq_len, vocabulary_size).astype(np.int8)
    batch = onehot[np.newaxis]

    scores = model.predict(batch)[0]
    print(''.join(map(id2char, scores.argmax(axis=1))))

# Try the model on a few 20-character inputs.
for sample in ("sequence to sequence",
               "rn border with chile",
               " collectively and th",
               "i am kalen hello guy"):
    test_output(sample)

ecnenues ot ecneunes
nr redrow htiw elihc
 ylelitcelloc dna ht
m na nelam olluh yld
