# Setup

In [1]:
import numpy as np

# Helper Functions
- Softmax

In [2]:
def softmax(x):
    s = np.exp(x - np.max(x))
    return s / s.sum(axis=0)

- Smooth: make the loss curve smoother

In [3]:
def smooth(loss, cur_loss):
    return loss * 0.999 + cur_loss * 0.001

- Initial Loss: to smooth the loss

In [4]:
def get_initial_loss(vocab_size, seq_length):
    return -np.log(1.0/vocab_size) * seq_length

- Print Sample: print the generated sample names

In [5]:
def print_sample(sample_indices, idx_to_char):
    
    txt = ''.join(idx_to_char[idx] for idx in sample_indices)
    res = txt[0].upper() + txt[1:]  # capitalize the first character
    print('%s' % res, end='')

# Data
## Overview

In [6]:
data = open('dinos.txt', 'r').read().lower()
chars = sorted(list(set(data)))

data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in the data' % (data_size, vocab_size))

There are 19909 total characters and 27 unique characters in the data


In [7]:
print(chars)

['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


## Preprocessing

In [8]:
# hashtable mapping characters to the indices
char_to_idx = {char:i for i,char in enumerate(chars)}

# hashtable mapping indices to characters
idx_to_char = {i:char for i, char in enumerate(chars)}

# RNN Cell
- Forward Propagation

In [9]:
def rnn_forward_step(params, a_prev, x):
    
    Waa, Wax, Wya, ba, by = params['Waa'], params['Wax'], params['Wya'], params['ba'], params['by']
    a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, x) + ba)
    y_hat = softmax(np.dot(Wya, a_next) + by)
    
    return a_next, y_hat

In [10]:
def rnn_forward(X, Y, a0, params, n_x=vocab_size):
    
    # Initialize
    x, a, y_hat = {}, {}, {}
    a[-1] = np.copy(a0)
    
    loss = 0
    
    # Loop
    for t in range(len(X)):
        
        ## x[t]: one-hot vector representation of the t-th character in X
        x[t] = np.zeros((n_x, 1))
        if X[t] != None:
            x[t][X[t]] = 1
        
        ## run one forward step of the RNN
        a[t], y_hat[t] = rnn_forward_step(params, a[t-1], x[t])
        
        ## update the loss by substracting the cross-entropy term of this time-step
        loss -= np.log(y_hat[t][Y[t], 0])
    
    cache = (y_hat, a, x)
    
    return loss, cache

- Backward Propagation

In [11]:
def rnn_backward_step(dy, grads, params, x, a, a_prev):
    
    grads['dWya'] += np.dot(dy, a.T)
    grads['dby'] += dy
    da = np.dot(params['Wya'].T, dy) + grads['da_next']
    dz = da * (1 - a * a)
    grads['dWaa'] += np.dot(dz, a_prev.T)
    grads['dWax'] += np.dot(dz, x.T)
    grads['dba'] += dz
    grads['da_next'] = np.dot(params['Waa'].T, dz)
    
    return grads

In [12]:
def rnn_backward(X, Y, params, cache):
    
    # Retrieve from cache and params
    (y_hat, a, x) = cache
    Waa, Wax, Wya, ba, by = params['Waa'], params['Wax'], params['Wya'], params['ba'], params['by']
    
    # Initialize
    grads = {}
    grads['dWaa'], grads['dWax'], grads['dWya'] = np.zeros_like(Waa), np.zeros_like(Wax), np.zeros_like(Wya)
    grads['dba'], grads['dby'] = np.zeros_like(ba), np.zeros_like(by)
    grads['da_next'] = np.zeros_like(a[0])
    
    # Loop
    for t in reversed(range(len(X))):
        
        ## dy
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        
        ## run one backward step of the RNN
        grads = rnn_backward_step(dy, grads, params, x[t], a[t], a[t-1])
    
    return grads, a

# Models
- Initialization

In [13]:
def initialize_params(n_a, n_x, n_y):
    
    np.random.seed(1)
    
    Wax = np.random.randn(n_a, n_x) * 0.01
    Waa = np.random.randn(n_a, n_a) * 0.01
    Wya = np.random.randn(n_y, n_a) * 0.01
    ba = np.zeros((n_a, 1))
    by = np.zeros((n_y, 1))
    
    params = {'Wax': Wax, 'Waa': Waa, 'Wya': Wya, 'ba': ba,'by': by}
    
    return params

- Gradient Clipping

In [14]:
def grads_clipping(grads, max_value):
    
    dWaa, dWax, dWya, dba, dby = grads['dWaa'], grads['dWax'], grads['dWya'], grads['dba'], grads['dby']
   
    for gradient in [dWaa, dWax, dWya, dba, dby]:
        np.clip(gradient, -max_value, max_value, out=gradient)
    
    grads = {'dWaa': dWaa, 'dWax': dWax, 'dWya': dWya, 'dba': dba, 'dby': dby}
    
    return grads

- Update Parameters

In [15]:
def update_GD(params, grads, lr):
    
    for param in ('Waa', 'Wax', 'Wya', 'ba', 'by'):
        params[param] -= lr * grads['d' + param]
        
    return params

- Sampling: generate new sequences after training

In [16]:
def sampling(params, char_to_idx):
    
    # Retrieve relevant shapes
    n_a, n_x = params['Wax'].shape[0], params['Wax'].shape[1]
    
    # Initialize
    x = np.zeros((n_x, 1))
    a_prev = np.zeros((n_a, 1))
    
    res = []  ## result list of indices of the characters to generate
    
    idx = -1
    counter = 0
    idx_newline = char_to_idx['\n']
    
    # Loop
    while idx != idx_newline and counter < 50:
        
        ## forward propagation
        a_prev, y_hat = rnn_forward_step(params, a_prev, x)
        
        ## sample the index of a character from probability distribution y_hat 
        idx = np.random.choice(range(n_x), p=y_hat.ravel())
        
        ## append the index to result
        res.append(idx)
        
        # overwrite the input with one-hot vector corresponding to the sampled index
        x = np.zeros((n_x, 1))
        x[idx] = 1
                
        counter += 1
        
    if counter == 50:
        res.append(idx_newline)
    
    return res

- Optimize: implement one step of Gradient Descent optimization

In [17]:
def optimize(X, Y, vocab_size, a_prev, params, max_value=5, lr=0.01):
    
    # Forward propagation
    loss, cache = rnn_forward(X, Y, a_prev, params, n_x=vocab_size)
    
    # Backward propagation
    grads, a = rnn_backward(X, Y, params, cache)
    
    # Gradient clipping
    grads = grads_clipping(grads, max_value)
    
    # Update parameters
    params = update_GD(params, grads, lr)
    
    return loss, grads, a[-1]

# Training

In [18]:
def language_model(data, idx_to_char, char_to_idx, epochs=35000, n_a=50, n_samples=7, vocab_size=vocab_size):
    
    # Retrieve relevant shapes
    n_x, n_y = vocab_size, vocab_size
    
    # Initialize parameters
    params = initialize_params(n_a, n_x, n_y)
    a_prev = np.zeros((n_a, 1))
    
    # Initialize loss: to smooth the loss
    loss = get_initial_loss(vocab_size, n_samples)
    
    # Generate training examples: a list of all dinosaur names
    examples = data.split('\n')
    
    # Shuffle the training examples
    np.random.seed(0)
    np.random.shuffle(examples)
    
    # Loop for SGD
    for j in range(epochs):
        
        idx = j % len(examples)
        
        # generate input X: prepend None to initialize x0 = 0
        single_example = examples[idx]
        X = [None] + [char_to_idx[char] for char in single_example]
        
        # generate label Y: append '\n'
        idx_newline = char_to_idx['\n']
        Y = X[1:] + [idx_newline]
        
        # optimization step
        curr_loss, grads, a_prev = optimize(X, Y, vocab_size, a_prev, params)
        
        # accelerate training: use a latency trick to keep the loss smooth
        loss = smooth(loss, curr_loss)
        
        # every 2000 epochs, generate samples tp check if the model is learning prorperly
        if j % 2000 == 0:
            
            print('Iteration: %d | Loss: %f' % (j, loss) + '\n')
            
            for _ in range(n_samples):
                sampled_indices = sampling(params, char_to_idx)
                print_sample(sampled_indices, idx_to_char)
            
            print('\n')

    return params

In [19]:
params = language_model(data, idx_to_char, char_to_idx)

Iteration: 0 | Loss: 23.087336

Co
Eyhqgyshubjerbjgmis
Qfguyhvbqtdkxb
Okuxhjmozrjgycsxqkeisqywvxgijorg
Jhywwqinubuavuapmdctozzzcyflteoebci
Kqjbagdzduoomziuojypckurxgtloyzwiwqk
Tmutuhunlr


Iteration: 2000 | Loss: 27.336073

Alydibenus
Atokycrherosaurus
Icrostovod
Gtrycathotisaurus
Motosaurus
Hqmacisgurus
Rwerosiulos


Iteration: 4000 | Loss: 25.118910

Lelocorasaurus
Eutorytops
Kytacronosaurus
Pagchiisyus
Ameorkusaurus
Aurosaurus
Lithysaurus


Iteration: 6000 | Loss: 23.797897

Prarosaurus
Alaurusaurus
Tyakeelosaurus
Yraenosaurus
Thorolortatra
Rostenon
Leroptows


Iteration: 8000 | Loss: 23.276764

Bramutos
Lolaimasaura
Erosmusus
Nazerosaurus
Crleuycianosaurus
Akbeopoptyrus
Taahiratops


Iteration: 10000 | Loss: 23.055204

Lovocogangosaurus
Lomomasaurus
Urolakosanicasaurus
Clalnosaurus
Diuliatenaurus
Baraiachamia
Aostanolosaurus


Iteration: 12000 | Loss: 22.395661

Holcessaurus
Avisaurus
Cengasinurosaurus
Athushunosheasamrus
Xyurlasus
Cosvoraptor
Sengtassurus


Iteration: 14000 | Loss