In [1]:
# Noise suppression is configured BEFORE the numeric/Keras imports on purpose:
# TF_CPP_MIN_LOG_LEVEL is only honoured if it is already set when TensorFlow's
# native logging starts up, and replacing warnings.warn only silences modules
# that look the function up after the replacement has been installed.
import os
import warnings

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    pass


warnings.warn = warn

import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, RNN, TimeDistributed
# NOTE(review): the star imports below hide where names such as ``Adam`` come
# from; they are kept because other cells may rely on names they provide.
from keras.optimizers import *
from keras.activations import *
import keras.backend as K

### Import ข้อมูลชื่อไดโนเสาร์

```
Aachenosaurus
Aardonyx
Abdallahsaurus
Abelisaurus
Abrictosaurus
...
Zuolong
Zuoyunlong
Zupaysaurus
Zuul
```

In [2]:
# Read the whole corpus in one go and normalise it to lower case.
with open("./data/dinos.txt", "r", encoding="utf-8") as f:
    dinos = f.read()
dinos = dinos.lower()
# The vocabulary is built from the raw text so that it contains the '\n'
# separator, which other cells use as the end-of-name symbol.  The union keeps
# '\n' in the vocabulary even if the file happens to contain a single name.
chars = sorted(set(dinos) | {'\n'})
# One name per line.  Blank entries (e.g. from a trailing newline at the end of
# the file) are dropped: an empty name would become a zero-length sequence that
# cannot be reshaped into a (1, steps, features) training batch.
dinos = [name for name in dinos.split('\n') if name]

In [3]:
# Preview the first ten names to confirm they were lower-cased and split one per line.
dinos[:10]

['aachenosaurus',
 'aardonyx',
 'abdallahsaurus',
 'abelisaurus',
 'abrictosaurus',
 'abrosaurus',
 'abydosaurus',
 'acanthopholis',
 'achelousaurus',
 'acheroraptor']

In [4]:
# Two-way lookup between a character and its position in the sorted vocabulary.
ix_to_char = dict(enumerate(chars))
char_to_ix = {symbol: position for position, symbol in ix_to_char.items()}

In [5]:
# Display the index -> character lookup table.
ix_to_char

{0: '\n',
 1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z'}

In [6]:
# Display the character -> index lookup table (the inverse of ix_to_char).
char_to_ix

{'\n': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

### encode the dinos data

In [7]:
# Replace every character of every name with its vocabulary index,
# producing one list of integers per name.
dinos_encoded = [[char_to_ix[symbol] for symbol in name] for name in dinos]

In [8]:
# One-hot encode every name: row t of a name's matrix is the indicator vector
# of its t-th character, so each name becomes a (len(name), len(chars)) float
# matrix.  Indexing the identity matrix with the index list builds the whole
# matrix in one step instead of one np.zeros() per character.
identity = np.eye(len(chars))

# Names have different lengths, so the matrices cannot be stacked into one
# rectangular array.  A 1-D object array is filled element by element instead
# of calling np.asarray() on the ragged list, which raises ValueError on
# NumPy >= 1.24 (and emitted a deprecation warning on earlier releases).
dinos_onehot = np.empty(len(dinos_encoded), dtype=object)
for row, dino in enumerate(dinos_encoded):
    dinos_onehot[row] = identity[np.asarray(dino, dtype=int)]

In [9]:
# Shape of the first encoded name: (number of characters, vocabulary size).
dinos_onehot[0].shape

(13, 27)

### Prepare data for input/output

In [10]:
# Model inputs: a copy of the one-hot encoded names (one matrix per name).
X = dinos_onehot.copy()

In [11]:
# Targets are the inputs shifted one step to the left, with the one-hot '\n'
# vector appended as the final target so that every name ends on a newline.
# The newline vector is identical for every name, so it is built once.
new_line = np.zeros(len(chars))
new_line[char_to_ix['\n']] = 1

# Like the inputs, the targets differ in length per name.  They are stored in
# an explicit 1-D object array because np.asarray() on a ragged list raises
# ValueError on NumPy >= 1.24.
Y = np.empty(len(X), dtype=object)
for row, x in enumerate(X):
    Y[row] = np.append(x[1:], [new_line], axis=0)

## Sample dino name

In [12]:
def decode_onehot(onehot, ix_to_char):
    """Translate a one-hot (or score) vector back into its symbol.

    The position of the largest entry of ``onehot`` is looked up in
    ``ix_to_char`` and the value stored there is returned.
    """
    return ix_to_char[np.argmax(onehot)]

def encode_onehot(n_x, loc):
    """Build a length-``n_x`` float vector that is 1 at ``loc`` and 0 elsewhere."""
    vector = np.zeros(n_x)
    vector[loc] = 1
    return vector

def gen_dino_by_model(model, ix_to_char, n_x, max_len=25):
    """Sample one name, character by character, from a trained model.

    Generation starts from a single all-zero input row.  At every step the
    whole sequence so far is passed to ``model.predict`` as one batch of shape
    ``(1, steps, n_x)``; the distribution predicted for the last step is
    sampled and the drawn character is appended to the sequence.

    Parameters
    ----------
    model : object with ``predict(batch)``
        ``predict`` must return per-step distributions over the ``n_x``
        symbols; only the last step of the last sequence is used.
    ix_to_char : mapping of int to str
        Index -> character lookup; the entry mapped to ``'\\n'`` ends a name.
    n_x : int
        Vocabulary size (length of each one-hot row).
    max_len : int, default 25
        Maximum number of sampling steps.

    Returns
    -------
    str
        The generated name without the terminating newline.  A newline drawn
        before two characters exist is discarded (it still uses up a step),
        so a name that ends on a newline has at least two characters.
    """
    min_chars = 2  # a newline is only accepted once this many characters exist
    x = np.zeros((1, n_x))
    sampled = []
    for _ in range(max_len):
        predicted = model.predict(x.reshape(-1, x.shape[0], x.shape[1]))
        # Softmax outputs are commonly float32 and may miss np.random.choice's
        # sum-to-one tolerance; renormalise in float64 before sampling.
        y_prob = np.asarray(predicted[-1][-1], dtype=np.float64)
        y_prob = y_prob / y_prob.sum()
        loc = int(np.random.choice(n_x, p=y_prob))

        if ix_to_char[loc] == '\n':
            if len(sampled) >= min_chars:
                break
            # Too early to stop: ignore this draw and sample again.
            continue

        sampled.append(loc)
        x_next = np.zeros((1, n_x))
        x_next[0, loc] = 1
        x = np.append(x, x_next, axis=0)

    # Only real characters were collected, so nothing has to be trimmed here
    # (trimming unconditionally would lose a character whenever the loop ends
    # on max_len instead of on a newline).
    return ''.join(ix_to_char[ix] for ix in sampled)

## Model

References on the variable-input-length problem (training an RNN on sequences of different lengths):
- [Variable Sequence Lengths in TensorFlow](https://danijar.com/variable-sequence-lengths-in-tensorflow/)
- [varying-sequence-length-in-keras-without-padding](https://stackoverflow.com/questions/44873387/varying-sequence-length-in-keras-without-padding)
- [training-an-rnn-with-examples-of-different-lengths-in-keras](https://datascience.stackexchange.com/questions/26366/training-an-rnn-with-examples-of-different-lengths-in-keras)
- [Introduction to Recurrent Neural Networks (RNN) with Dinosaurs](https://towardsdatascience.com/introduction-to-recurrent-neural-networks-rnn-with-dinosaurs-790e74e3e6f6)

In [13]:
# Network: one SimpleRNN layer that emits its hidden state at every time step,
# followed by a softmax Dense layer that scores every vocabulary symbol.
n_a = 50            # number of units in the recurrent layer
n_x = len(chars)    # vocabulary size = width of each one-hot input row

# The time dimension is left as None so sequences of any length are accepted.
simple_rnn = SimpleRNN(units=n_a, input_shape=(None, n_x), return_sequences=True)
dense = Dense(len(chars), activation='softmax')
model = Sequential([simple_rnn, dense])

In [14]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, None, 50)          3900      
_________________________________________________________________
dense (Dense)                (None, None, 27)          1377      
Total params: 5,277
Trainable params: 5,277
Non-trainable params: 0
_________________________________________________________________


### train แต่ละ iteration ด้วย 1 ชื่อ (batch size = 1)
เนื่องจากความยาวของแต่ละชื่อไม่เท่ากัน จึงรวมหลายชื่อเป็น batch เดียวกันไม่ได้หากไม่ทำ padding

#### โมเดลเรียนรู้ได้ แต่การสุ่มชื่อแบบคำนวณเองให้ผลไม่ถูกต้อง จึงใช้ `model.predict` ในการสุ่มแทน

In [15]:
# Adam with element-wise gradient clipping.
opt = Adam(learning_rate=0.001, clipvalue=5)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# iteration number -> list of names sampled from the model at that point
generated_dinos = {}

iterations = 31000
# Reporting settings do not change during training, so they are set once here
# instead of being re-bound inside the loop.
sample_every = 2000   # report and sample every this many iterations
max_dino = 10         # names sampled per report
max_dino_name = 20    # maximum sampling steps per generated name

for i in range(iterations):
    # Walk through the names in order, wrapping around at the end of the list.
    dino = i % len(dinos)
    x = X[dino]
    y = Y[dino]
    # Each name is trained as its own batch of size 1: (1, steps, features).
    x = x.reshape(-1, x.shape[0], x.shape[1])
    y = y.reshape(-1, y.shape[0], y.shape[1])

    model.train_on_batch(x, y)

    # generate new dinosaur name
    if i % sample_every == 0:
        # verbose=0 keeps Keras from printing a progress bar for every call.
        predicted = model.predict(x, verbose=0)
        predicted = np.argmax(predicted, axis=2).ravel().tolist()
        # A distinct comprehension variable avoids shadowing the loop counter i.
        predicted = [ix_to_char[ix] for ix in predicted]
        print(f"iterations: {i}, train sample: {dinos[dino]}")
        print(f"predicted: {''.join(predicted)}")
        dinos_name = [
            gen_dino_by_model(model, ix_to_char, n_x, max_dino_name)
            for _ in range(max_dino)
        ]
        generated_dinos[i] = dinos_name

iterations: 0, train sample: aachenosaurus
predicted: yyqqqgycyakau
iterations: 2000, train sample: eousdryosaurus
predicted: rnsauoss
urus

iterations: 4000, train sample: neovenator
predicted: grsenator

iterations: 6000, train sample: traukutitan
predicted: eotsusosan

iterations: 8000, train sample: coelurus
predicted: hnnosos

iterations: 10000, train sample: limnornis
predicted: aaaosaas

iterations: 12000, train sample: sinocalliopteryx
predicted: aaoshraoansoryx

iterations: 14000, train sample: baotianmansaurus
predicted: ardor
ausu
urus

iterations: 16000, train sample: hypsilophodon
predicted: apharoshosos

iterations: 18000, train sample: protognathosaurus
predicted: aoponoithosaurus

iterations: 20000, train sample: agujaceratops
predicted: ursineratops

iterations: 22000, train sample: fabrosaurus
predicted: unrosaurus

iterations: 24000, train sample: omosaurus
predicted: rasaurus

iterations: 26000, train sample: unenlagia
predicted: rinoosoa

iterations: 28000, train s

In [16]:
# For every recorded checkpoint, print a header line, the first five sampled
# names (one per line) and a blank separator, all through a single print call.
for iteration, names in generated_dinos.items():
    report_lines = [f"iteration: {iteration}", *names[:5], '\n']
    print('\n'.join(report_lines))

iteration: 0
pylldujnnpwchrkxuxt
fa
dvmfiohbsdfjnzqwqwe
snesdqmsznrkrefnxdb
zztzcaxklgojweiwzmn


iteration: 2000
abgoduc
vraurap
vodor
araptotps
olyna


iteration: 4000
urjua
onojodasaurus
onelisaeras
engosasaun
pivoseurus


iteration: 6000
eneosaurus
tihulita
onomeris
anchiania
egumosaurus


iteration: 8000
onerusaurus
eixosaurus
urima
ysthuria
rasyesaurashus


iteration: 10000
apaluphosaurus
aplosiuihilhsaurusa
uriengoxesaurus
eneisaurus
bolosaucus


iteration: 12000
felosauros
chybosaurus
rayongusoun
aosaurus
eusa


iteration: 14000
onzatisus
iongosauau
lopheana
utastelongus
runaconltor


iteration: 16000
ptolin
rokisaurus
riperosaurus
eicasaurus
ingdamosaurux


iteration: 18000
egosaurus
chalosaurus
iauanatata
atasaurus
etrapshiploce


iteration: 20000
ehceshundon
argbelons
iloseunos
menigong
rinyaniton


iteration: 22000
ridcodosaurus
umonvongodon
urosaurus
omomosaurus
impeota


iteration: 24000
ua
leskonesaurus
ruinosaurus
agsacrgaa
apalia


iteration: 26000
onitosaurus
ughitia
