# Character level language model - Dinosaurus land
### Keras implementation of the Deep Learning Specialization project


Naming new dinos using RNNs

In [1]:
from keras.models import Model
from keras.layers import Input, SimpleRNN, Activation, Dense
from keras.utils import to_categorical
from keras.callbacks import LambdaCallback
from numpy.random import seed
import tensorflow as tf
import random
import sys

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def decode_ix(encoding):
    """
    Turn an array of character indices back into text.

    The array (e.g. one of shape (n, 1)) is flattened and every index
    is looked up in the global `ix_to_char` table; the resulting
    characters are concatenated in order.
    """
    decoded_chars = []
    for ix in encoding.ravel():
        decoded_chars.append(ix_to_char[ix])
    return "".join(decoded_chars)

In [3]:
import numpy as np
# Read the whole corpus into memory, then lower-case it
with open("dinos.txt", "r") as fh:
    raw_text = fh.read()
dinos = raw_text.lower()
# Every distinct character of the corpus (separators included), sorted
characters = sorted(set(dinos))

We look at a small sample of the names in the training dataset

In [4]:
# Fix NumPy's global RNG so the sample shown below is repeatable
seed(1643)
# Draw 7 names from the whitespace-separated corpus
np.random.choice(dinos.split(), size=7)

array(['opisthocoelicaudia', 'skeleton', 'marshosaurus', 'tototlmimus',
       'ozraptor', 'hylosaurus', 'elvisaurus'], dtype='<U23')

In [5]:
# Forward lookup: each distinct character of the corpus -> its position
# in the sorted `characters` list
char_to_ix = {char: ix for ix, char in enumerate(characters)}
# Inverse lookup: position -> character
ix_to_char = dict(enumerate(characters))

# Number of distinct characters
nchars = len(characters)
# Length of the whole corpus string in characters (despite the name,
# this is not the vocabulary size)
nvocab = len(dinos)

In [6]:
# Display the character -> index table built above
char_to_ix

{'\n': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [7]:
# Number of consecutive characters fed to the network per training example
seqlen = 10
xtrain, ytrain = [], []
# Slide a window of `seqlen` characters over the whole corpus string
# (nvocab is its length); the character right after each window is the
# target the network has to predict.
for i in range(0, nvocab - seqlen):
    xt = dinos[i: i + seqlen]
    yt = dinos[i + seqlen]
    xtrain.append([char_to_ix[char] for char in xt])
    ytrain.append(char_to_ix[yt])

# At this point xtrain is a list of len(xtrain) windows of seqlen indices
print(f"Number of training instances: {len(xtrain):,}")

# Reshape into (samples, timesteps, features): every timestep carries a
# single feature, the raw (unscaled) character index
xtrain = np.reshape(xtrain, (-1, seqlen, 1))
# One-hot encode the targets. num_classes is pinned to the number of
# distinct characters so the width never depends on which characters
# happen to occur as targets.
ytrain = to_categorical(ytrain, num_classes=nchars)
xtrain.shape, ytrain.shape

Number of training instances: 19,903


((19903, 10, 1), (19903, 27))

First five instances in the training dataset: each is a window of `seqlen` characters, followed by the character that comes next in the corpus.

In [8]:
# For each of the first five instances show: the raw index window,
# the decoded window, and the decoded target character
for i in range(5):
    window, target = xtrain[i], ytrain[i]
    # the target row is one-hot, so the position of its maximum is the index
    target_ix = int(np.argmax(target))
    print(window.ravel())
    print(repr(decode_ix(window)))
    print(repr(ix_to_char[target_ix]))
    print()

[ 1  1  3  8  5 14 15 19  1 21]
'aachenosau'
'r'

[ 1  3  8  5 14 15 19  1 21 18]
'achenosaur'
'u'

[ 3  8  5 14 15 19  1 21 18 21]
'chenosauru'
's'

[ 8  5 14 15 19  1 21 18 21 19]
'henosaurus'
'\n'

[ 5 14 15 19  1 21 18 21 19  0]
'enosaurus\n'
'a'



In [9]:
# Reset TensorFlow's default graph before building the model
tf.reset_default_graph()
### RNN built with the Keras functional API ###
# The Input describes one training instance only (xtrain.shape[1:]);
# the sample axis is left out
xin = Input(shape=xtrain.shape[1:])
# Recurrent layer: 256 units with tanh activation
recurrent = SimpleRNN(256, activation="tanh")
# Dense "feedforward" layer with softmax activation, one output per
# column of the one-hot targets; it is applied to what the recurrent
# layer returns after reading the seqlen input characters
classifier = Dense(ytrain.shape[1], activation="softmax")
x = classifier(recurrent(xin))

model = Model(inputs=xin, outputs=x)
model.compile(loss="categorical_crossentropy", optimizer="adam")
model.fit(xtrain, ytrain, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1818f8ff60>

In [13]:
# Pick a random training window to prime the generator. randint's upper
# bound is exclusive, so len(xtrain) makes every window eligible.
start = np.random.randint(0, len(xtrain))
pattern = xtrain[start].ravel()
print("Seed:", end="")
print(repr(decode_ix(pattern)))
# Generate 150 characters greedily: always take the most probable next
# character, print it, and slide the window one step forward.
for i in range(150):
    x = np.reshape(pattern, (1, len(pattern), 1))
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = ix_to_char[index]
    print(result, end="")
    # append the prediction and drop the oldest character so the window
    # keeps its length
    pattern = np.append(pattern, index)[1:]
print("\n...Done...")

Seed:'us\nhuabeis'
aurus
araucanoraptor
aratososaurus
araucanoraptor
aratososaurus
araucanoraptor
aratososaurus
araucanoraptor
aratososaurus
araucanoraptor
aratososaurus
...Done...


In [None]:
# Training dataset: Outputs
dinos_list = [[char_to_ix[c] for c in d + "\n"] for d in dinos.split()]
maxlen =  len(max(dinos_list, key=len))
ndinos = len(dinos_list)
nchars = len(characters)
xtrain = np.zeros((ndinos, maxlen, nchars))
ytrain = np.zeros((ndinos, maxlen, nchars))

xtrain = np.zeros((ndinos, maxlen))


In [None]:
tf.reset_default_graph()
xin = Input(shape=(maxlen,))
x = SimpleRNN(10, activation="tanh")(xin)
x = Dense()
x = Activation("softmax")(x)

model = Model(inputs=xin, outputs=x)
model.compile("adam", loss="categorical_crossentropy")
model.fit(xtrain, ytrain)

In [16]:
# Run the model on a single all-zero input of shape (1, maxlen, nchars)
# and show the prediction for that one sample
model.predict(np.zeros((1, maxlen, nchars)))[0]

array([0.03703704, 0.03703704, 0.03703704, 0.03703704, 0.03703704,
       0.03703704, 0.03703704, 0.03703704, 0.03703704, 0.03703704,
       0.03703704, 0.03703704, 0.03703704, 0.03703704, 0.03703704,
       0.03703704, 0.03703704, 0.03703704, 0.03703704, 0.03703704,
       0.03703704, 0.03703704, 0.03703704, 0.03703704, 0.03703704,
       0.03703704, 0.03703704], dtype=float32)