In [7]:
import numpy as np
import random
from utils import *

## Dataset and Preprocessing

In [8]:
# Read the whole corpus and lower-case it. The context manager guarantees the
# file handle is closed, which the bare open(...).read() did not.
with open("dinos.txt", 'r') as f:
    data = f.read().lower()

# Unique characters of the corpus. Sorting makes the order deterministic across
# runs (iteration order of a set of strings changes with hash randomisation).
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

There are 19909 total characters and 27 unique characters in your data.


In [9]:
# Index -> character lookup: characters are numbered by their sorted position.
ix_to_char = dict(enumerate(sorted(chars)))
# Character -> index lookup: the exact inverse of the mapping above.
char_to_ix = {ch: i for i, ch in ix_to_char.items()}
ix_to_char

{0: '\n',
 1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z'}

In [10]:
def clip(gradients, maxValue):
    """
    Clip every gradient array element-wise to the interval [-maxValue, maxValue].

    The arrays stored in `gradients` are clipped in place (np.clip writes into
    the array passed as `out`); the returned dictionary is a new dict object
    that references those same arrays.

    Arguments:
    gradients -- dictionary holding the arrays "dWaa", "dWax", "dWya", "db", "dby"
    maxValue -- bound of the clipping interval; values end up in [-maxValue, maxValue]

    Returns:
    gradients -- new dictionary with the same five keys and the clipped arrays
    """
    names = ("dWaa", "dWax", "dWya", "db", "dby")

    # Look every key up first so that a missing key raises before anything is modified.
    clipped = {name: gradients[name] for name in names}

    for array in clipped.values():
        np.clip(array, -maxValue, maxValue, out=array)

    return clipped

## Sampling

Once the model is trained, we would like to use it to generate text (one character at a time).

In [11]:
def sample(parameters, char_to_ix, seed, max_length=50):
    """
    Sample a sequence of character indices from the RNN, one character at a time.

    Sampling starts from an all-zero input vector and an all-zero hidden state.
    Each step runs one forward pass, draws the next index from the resulting
    probability vector and feeds that index back in as a one-hot input. It stops
    as soon as the newline index is drawn or once `max_length` characters have
    been drawn.

    Arguments:
    parameters -- dictionary holding the arrays "Waa", "Wax", "Wya", "by" and "b"
    char_to_ix -- dictionary mapping characters to indices; must contain the newline character
    seed -- not read by this function; kept so that existing callers keep working
    max_length -- maximum number of characters to draw before the sequence is terminated

    Returns:
    indices -- list of sampled indices, ending with exactly one newline index
    """
    Waa, Wax, Wya = parameters['Waa'], parameters['Wax'], parameters['Wya']
    by, b = parameters['by'], parameters['b']
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]

    ## first input and initial hidden state are all zeros
    x = np.zeros((vocab_size, 1))
    a_prev = np.zeros((n_a, 1))

    ## indices of the characters generated so far
    indices = []
    ## -1 is a sentinel that can never equal a real index, so the loop is entered
    idx = -1
    newline_character = char_to_ix["\n"]

    ## the candidate indices do not change between steps, so build them once
    candidates = list(range(vocab_size))

    while idx != newline_character and len(indices) < max_length:
        ## forward propagate x through one RNN step
        a = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + b)
        z = np.dot(Wya, a) + by
        y = softmax(z)

        ## draw the next character index from the predicted distribution y
        idx = np.random.choice(candidates, p=y.ravel())
        indices.append(idx)

        ## the sampled character becomes the next one-hot input
        x = np.zeros((vocab_size, 1))
        x[idx] = 1

        a_prev = a

    ## Terminate the sequence only if it does not already end with a newline.
    ## Testing the counter instead would append a second newline whenever the
    ## last allowed character happened to be the newline itself.
    if idx != newline_character:
        indices.append(newline_character)

    return indices

In [12]:
# Smoke test for sample(): build random parameters of the right shapes and draw one sequence.
np.random.seed(2)

# Hidden-state size used for this check only. The unused value that was bound
# to `_` is gone: assigning to `_` shadows IPython's "last output" variable.
n_a = 100

# The randn calls stay in the same order (Wax, Waa, Wya, b, by), so the seeded
# random stream produces the same parameters.
Wax = np.random.randn(n_a, vocab_size)
Waa = np.random.randn(n_a, n_a)
Wya = np.random.randn(vocab_size, n_a)
b = np.random.randn(n_a, 1)
by = np.random.randn(vocab_size, 1)
parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "b": b, "by": by}


indices = sample(parameters, char_to_ix, 0)
print("Sampling:")
print("list of sampled indices:", indices)
print("list of sampled characters:", [ix_to_char[i] for i in indices])

Sampling:
list of sampled indices: [12, 23, 24, 14, 7, 2, 10, 23, 25, 12, 3, 24, 15, 24, 3, 20, 3, 17, 4, 0]
list of sampled characters: ['l', 'w', 'x', 'n', 'g', 'b', 'j', 'w', 'y', 'l', 'c', 'x', 'o', 'x', 'c', 't', 'c', 'q', 'd', '\n']


## Building the language model

## Gradient descent

In [13]:
def optimize(X, Y, a_prev, parameters, learning_rate = 0.01):
    """
    Run one training step: forward pass, backward pass, gradient clipping, parameter update.

    Arguments:
    X -- input sequence, forwarded to rnn_forward and rnn_backward
    Y -- target sequence, forwarded to rnn_forward and rnn_backward
    a_prev -- previous hidden state, forwarded to rnn_forward
    parameters -- model parameters, forwarded to the helper functions
    learning_rate -- step size forwarded to update_parameters

    Returns:
    loss -- loss value returned by rnn_forward
    gradients -- gradients returned by rnn_backward after clipping to [-5, 5]
    a[len(X)-1] -- element len(X)-1 of the second value returned by rnn_backward

    Note: the value returned by update_parameters is not passed back to the
    caller, so the caller only sees the update if update_parameters modifies
    `parameters` in place -- presumably it does; verify in utils.
    """
    # Forward pass: loss plus the cache needed by the backward pass
    loss, cache = rnn_forward(X, Y, a_prev, parameters)

    # Backward pass through time
    gradients, hidden_states = rnn_backward(X, Y, parameters, cache)

    # Keep every gradient entry inside [-5, 5]
    gradients = clip(gradients, 5)

    # Apply the update; the returned dictionary is not used (see the note above)
    update_parameters(parameters, gradients, learning_rate)

    return loss, gradients, hidden_states[len(X) - 1]

## Training the model

In [14]:
def model(data, ix_to_char, char_to_ix, num_iterations = 35000, n_a = 50, dino_names = 7, vocab_size = 27):
    """
    Train the character-level model and periodically print sampled dinosaur names.

    Arguments:
    data -- not read by this function; the training names are loaded from "dinos.txt"
    ix_to_char -- dictionary mapping an index to its character (used when printing samples)
    char_to_ix -- dictionary mapping a character to its index
    num_iterations -- number of optimization steps, one training example per step
    n_a -- size of the hidden state
    dino_names -- number of names sampled and printed at every report
    vocab_size -- number of distinct characters; used for both the input and output size

    Returns:
    parameters -- the parameter dictionary created by initialize_parameters and
                  handed to optimize at every step
    """

    # Input and output dimensions are both the vocabulary size
    parameters = initialize_parameters(n_a, vocab_size, vocab_size)
    loss = get_initial_loss(vocab_size, dino_names)

    ## Build the list of all dinosaur names (the training examples);
    ## .strip() removes leading and trailing whitespace, including the newline
    with open("dinos.txt", 'r') as f:
        examples = [line.lower().strip() for line in f.readlines()]

    ## Shuffle the names with a fixed seed
    np.random.seed(42)
    np.random.shuffle(examples)

    a_prev = np.zeros((n_a, 1))

    # Optimization loop
    for iteration in range(num_iterations):

        ## One training example (X, Y): cycle through the shuffled names
        current_name = examples[iteration % len(examples)]
        ## X starts with None (presumably read as an all-zero first input by rnn_forward -- verify in utils);
        ## Y is X shifted left by one position and terminated with the newline index
        X = [None] + [char_to_ix[ch] for ch in current_name]
        Y = X[1:] + [char_to_ix["\n"]]

        ## One optimization step: forward-prop -> back-prop -> clip -> update parameters
        curr_loss, gradients, a_prev = optimize(X, Y, a_prev, parameters)

        loss = smooth(loss, curr_loss)

        # Only report every 2000 iterations
        if iteration % 2000 != 0:
            continue

        print('Iteration: %d, Loss: %f' % (iteration, loss) + '\n')

        # Sample and print `dino_names` names; the seed passed to sample() runs 0, 1, 2, ...
        for seed in range(dino_names):
            print_sample(sample(parameters, char_to_ix, seed), ix_to_char)

        print('\n')

    return parameters

In [16]:
# Load the dinosaur names (one per line) to inspect the training examples.
with open("dinos.txt", 'r') as f:
    ## lower-case every line; .strip() removes leading and trailing whitespace
    examples = [line.lower().strip() for line in f]
examples

['aachenosaurus',
 'aardonyx',
 'abdallahsaurus',
 'abelisaurus',
 'abrictosaurus',
 'abrosaurus',
 'abydosaurus',
 'acanthopholis',
 'achelousaurus',
 'acheroraptor',
 'achillesaurus',
 'achillobator',
 'acristavus',
 'acrocanthosaurus',
 'acrotholus',
 'actiosaurus',
 'adamantisaurus',
 'adasaurus',
 'adelolophus',
 'adeopapposaurus',
 'aegyptosaurus',
 'aeolosaurus',
 'aepisaurus',
 'aepyornithomimus',
 'aerosteon',
 'aetonyxafromimus',
 'afrovenator',
 'agathaumas',
 'aggiosaurus',
 'agilisaurus',
 'agnosphitys',
 'agrosaurus',
 'agujaceratops',
 'agustinia',
 'ahshislepelta',
 'airakoraptor',
 'ajancingenia',
 'ajkaceratops',
 'alamosaurus',
 'alaskacephale',
 'albalophosaurus',
 'albertaceratops',
 'albertadromeus',
 'albertavenator',
 'albertonykus',
 'albertosaurus',
 'albinykus',
 'albisaurus',
 'alcovasaurus',
 'alectrosaurus',
 'aletopelta',
 'algoasaurus',
 'alioramus',
 'aliwalia',
 'allosaurus',
 'almas',
 'alnashetri',
 'alocodon',
 'altirhinus',
 'altispinax',
 'alvarez

In [15]:
# Train with the defaults declared in model()'s signature (35000 iterations, n_a = 50);
# model() prints the smoothed loss and a batch of sampled names every 2000 iterations.
parameters = model(data, ix_to_char, char_to_ix)

Iteration: 0, Loss: 23.093927

Fsmberbvmlpvirogwuqvwsvrrptdww
Vcitdvvm
Gpzqgqnubtnyiqybyrahsaoipawzztct

Hmtrlgzkldurebrqgydkykqjgzkxfe
Qiwlzdwttvtpc
Xpumccrkemaoguhl


Iteration: 2000, Loss: 27.794811

Anineros
Iahyanosa
Enalacganosanaroctanosaurunusatonodalateveesaurusy
Yoknaailoscgelyatopsaurus
Tannogosattdpncrineoheuronoeus
Aeresannolabosaurus
Alantaditen


Iteration: 4000, Loss: 25.820948

Axxaurus
Modanon
X
Riterotops
Octor
Reparzaurus
Sthhoscsauruc


Iteration: 6000, Loss: 24.637894

Kinimaneciosaurus
Imenlosaures
Ruetateidemcophoran
Endoraqdus
Panxosaurus
Telonops
Rinasideclopnarlybonasaurus


Iteration: 8000, Loss: 24.183997

Hactenbhanasaurus
Inzonosaurus
Odionnsuelaxsauruinator
Hematngmosaurus
Uangusmredidsaurus
Thosaurus
Akururaps


Iteration: 10000, Loss: 23.762122

Qeonecin
Zapulegeclanzus
Phiconchiros
Kiahofle
Evptodontaphisaurus
Volongocopintosaurus
Acia


Iteration: 12000, Loss: 23.497971

Avmilisaurus
Eacurosaurus
Naxokus
Strejuqiniosaurus
Strrisaurus
Nyx
Dapschus


I