# Pokemon name generator using a simple RNN trained with SGD

In [1]:
import numpy as np
import random
from random import shuffle
import csv
from urllib.request import urlopen
import codecs

In [150]:
def get_names():
  """
  Download the Pokemon CSV and return the unique, lower-cased names.

  The first CSV row is treated as a header and skipped; the name is read
  from the fourth column (index 3) of every remaining row.

  Returns:
    names (np.array[str]): unique lower-cased names, in arbitrary (set) order
  """
  url = 'https://raw.githubusercontent.com/elvorfirilmathredia/PokemonRNN/main/pokemon.csv'

  # Use the response as a context manager so the connection is always closed,
  # even if decoding or CSV parsing fails part-way through.
  with urlopen(url) as stream:
    rows = list(csv.reader(codecs.iterdecode(stream, 'utf-8')))

  # Read the column row by row instead of slicing a 2-D array, so a short or
  # ragged row cannot break the whole download.
  uniq_names = {row[3].lower() for row in rows[1:] if len(row) > 3}
  return np.array(list(uniq_names))

In [3]:
def optimize(X, Y, a_prev, parameters, learning_rate = 0.01):
  """
  Run a single stochastic-gradient-descent step on one training sample.

  The model is a simple RNN cell:

                                probs = softmax(Wya * a_next + by)
                                  ^
                                  |
              +-------------------|-----+
              |                   |     |
              |     +------+      |     |
    a_prev -------->| tanh |------+----------> a_next = tanh(Wax * x + Waa * a_prev + b)
              |     +------+            |
              |         ^               |
              +---------|---------------+
                        |
                        |
                        x

  The step is: forward pass, backward pass, gradient clipping to [-5, 5],
  then a parameter update. The arrays stored in `parameters` are updated
  in place by update_parameters.

  Arguments:
    X (list[int]): character indices of the sample (the first entry is a placeholder)
    Y (list[int]): targets, i.e. X shifted one index to the left
    a_prev (np.array, (n_a, 1)): hidden state fed into the first time step
    parameters: python dict containing:
      Wax (np.array, (n_a, n_x)): input-to-hidden weights
      Waa (np.array, (n_a, n_a)): hidden-to-hidden weights
      Wya (np.array, (n_y, n_a)): hidden-to-output weights
      b (np.array, (n_a, 1)): hidden-state bias
      by (np.array, (n_y, 1)): output bias
    learning_rate (float): step size of the update

  Returns:
    loss: loss of this sample, computed before the update
    gradients: a dictionary containing the clipped gradients "dWaa", "dWax", "dWya", "db", "dby"
    a_last (np.array, (n_a, 1)): hidden state of the last time step
  """
  # The input size is the number of columns of the input weight matrix.
  (_, n_x) = parameters["Wax"].shape

  loss, forward_cache = rnn_forward(X, Y, a_prev, parameters, n_x)

  # Backpropagate, then clip to prevent exploding gradients.
  gradients = clip(rnn_backward(X, Y, parameters, forward_cache), 5)

  parameters = update_parameters(parameters, gradients, learning_rate)

  # The first cache entry holds the hidden states keyed by time step.
  hidden_states = forward_cache[0]
  last_step = len(X) - 1

  return loss, gradients, hidden_states[last_step]

In [4]:
def int_to_oh(m, n):
  """
  Build a one-hot column vector.

  Arguments:
    m (int): length of the vector
    n (int): position of the single 1

  Returns:
    vector (np.array[float], (m, 1)): zeros with a 1 at row n; all zeros
      when n lies outside [0, m)
  """
  one_hot = np.zeros((m, 1))
  if 0 <= n < m:
    one_hot[n, 0] = 1.0
  return one_hot

def softmax(x):
  """
  Numerically stable softmax over all entries of x.

  Arguments:
    x (np.array): scores (logits)

  Returns:
    probabilities (np.array): same shape as x, non-negative, summing to 1
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; keep the empty shape.
    return np.exp(x)
  # Subtracting the maximum leaves the result mathematically unchanged but
  # keeps np.exp from overflowing to inf (which would produce NaNs).
  e_x = np.exp(x - np.max(x))
  return e_x / e_x.sum()

def rnn_forward_step(parameters, a_prev, x):
  """
  Advance the RNN cell by one time step.

  Arguments:
    parameters: a dictionary containing the weights "Waa", "Wax", "Wya", "b", "by"
    a_prev (np.array, (n_a, 1)): hidden state of the previous step
    x (np.array, (n_x, 1)): input of this step

  Returns:
    a_next (np.array, (n_a, 1)): new hidden state, tanh(Wax x + Waa a_prev + b)
    p (np.array, (n_y, 1)): output distribution, softmax(Wya a_next + by)
  """
  # Hidden state: combine the input and recurrent contributions.
  input_term = np.dot(parameters['Wax'], x)
  recurrent_term = np.dot(parameters['Waa'], a_prev)
  a_next = np.tanh(input_term + recurrent_term + parameters['b'])

  # Output distribution over the vocabulary.
  logits = np.dot(parameters['Wya'], a_next) + parameters['by']
  return a_next, softmax(logits)

def rnn_forward(X, Y, a0, parameters, n_x):
  """
  Run one sample through the RNN and accumulate its loss.

  Arguments:
    X (list[int]): character indices of the sample; X[0] is never read
      because the first input is always the zero vector
    Y (list[int]): target indices, i.e. X shifted one index to the left
    a0 (np.array, (n_a, 1)): initial hidden state
    parameters: a dictionary containing the weights "Waa", "Wax", "Wya", "b", "by"
    n_x (int): size of the one-hot input vectors

  Returns:
    loss: sum over all time steps of -log(probability assigned to the target)
    cache (a tuple containing):
      a: dict of hidden states keyed by time step (key -1 holds a copy of a0)
      x: dict of input vectors keyed by time step
      y_hat: dict of softmax outputs keyed by time step
  """
  inputs, hidden, outputs = {}, {}, {}

  # Store the initial state under key -1 so that hidden[t-1] works for t == 0.
  hidden[-1] = np.copy(a0)
  loss = 0

  for t, char_ix in enumerate(X):
    # Each name is prepended with a placeholder so the model learns to
    # generate an initial character; that first input is the zero vector.
    if t == 0:
      inputs[t] = np.zeros((n_x, 1))
    else:
      inputs[t] = int_to_oh(n_x, char_ix)

    hidden[t], outputs[t] = rnn_forward_step(parameters, hidden[t-1], inputs[t])

    # Cross-entropy contribution of this step.
    target_probability = outputs[t][Y[t],0]
    loss = loss - np.log(target_probability)

  return loss, (hidden, inputs, outputs)

def rnn_backward_step(dy, gradients, parameters, x, a, a_prev):
  """
  Backpropagate through the RNN cell for a single time step.

  The gradient arrays in `gradients` are accumulated in place, and
  "da_next" is replaced by the gradient flowing to the previous step.

  Arguments:
    dy (np.array, (n_y, 1)): derivative of the loss wrt the pre-softmax output
    gradients: a dictionary containing the gradients "dWaa", "dWax", "dWya", "db", "dby"
      and the running "da_next"
    parameters: a dictionary containing at least the weights "Waa" and "Wya"
    x (np.array, (n_x, 1)): input of this step
    a (np.array, (n_a, 1)): hidden state of this step
    a_prev (np.array, (n_a, 1)): hidden state of the previous step

  Returns:
    gradients: the same dictionary, updated
  """
  # Output layer.
  gradients['dWya'] += np.dot(dy, a.T)
  gradients['dby'] += dy

  # Gradient reaching the hidden state: from the output of this step plus
  # whatever flows back from the following time step.
  hidden_grad = np.dot(parameters['Wya'].T, dy) + gradients['da_next']

  # Backprop through tanh: d tanh(z)/dz = 1 - tanh(z)^2 = 1 - a^2.
  pre_activation_grad = hidden_grad * (1 - a * a)

  contributions = (
    ('db', pre_activation_grad),
    ('dWax', np.dot(pre_activation_grad, x.T)),
    ('dWaa', np.dot(pre_activation_grad, a_prev.T)),
  )
  for key, contribution in contributions:
    gradients[key] += contribution

  # Hand the gradient over to the previous time step.
  gradients['da_next'] = np.dot(parameters['Waa'].T, pre_activation_grad)
  return gradients


def rnn_backward(X, Y, parameters, cache):
  """
  Backpropagate one sample through time.

  Arguments:
    X (list[int]): character indices of the sample (only its length is used)
    Y (list[int]): target indices, i.e. X shifted one index to the left
    parameters: a dictionary containing the weights "Waa", "Wax", "Wya", "b", "by"
    cache (a tuple containing):
      a: dict of hidden states keyed by time step
      x: dict of input vectors keyed by time step
      y_hat: dict of softmax outputs keyed by time step

  Returns:
    gradients: a dictionary containing the gradients "dWaa", "dWax", "dWya", "db", "dby"
      (plus the bookkeeping entry "da_next")
  """
  hidden, inputs, outputs = cache

  # Start every accumulator at zero, shaped like the parameter it belongs to.
  gradients = {
    'dWax': np.zeros_like(parameters['Wax']),
    'dWaa': np.zeros_like(parameters['Waa']),
    'dWya': np.zeros_like(parameters['Wya']),
    'db': np.zeros_like(parameters['b']),
    'dby': np.zeros_like(parameters['by']),
  }
  # Nothing flows back into the last time step from the future.
  gradients['da_next'] = np.zeros_like(hidden[0])

  n_y, _ = parameters['by'].shape

  # Walk the sequence from the last step to the first.
  for t in range(len(X) - 1, -1, -1):
    # Derivative of the loss wrt the pre-softmax output of the cell.
    # Softmax combined with cross-entropy gives p_i - y_i.
    dy = outputs[t] - int_to_oh(n_y, Y[t])

    gradients = rnn_backward_step(dy, gradients, parameters, inputs[t], hidden[t], hidden[t-1])

  return gradients

def update_parameters(parameters, gradients, lr):
  """
  Apply one gradient-descent update, modifying the parameter arrays in place.

  Arguments:
    parameters: a dictionary containing the weights "Waa", "Wax", "Wya", "b", "by"
    gradients: a dictionary containing the gradients "dWaa", "dWax", "dWya", "db", "dby"
    lr (float): learning rate

  Returns:
    parameters: the same dictionary, with every entry moved by -lr * gradient
  """
  # The gradient of parameter "P" is stored under the key "dP".
  for name in ('Wax', 'Waa', 'Wya', 'b', 'by'):
    step = -lr * gradients['d' + name]
    parameters[name] += step
  return parameters

def initialize_parameters(n_a, n_x, n_y):
  """
  Create the model parameters: small random weights and zero biases.

  Arguments:
    n_a (int): size of the hidden state
    n_x (int): size of the input vectors
    n_y (int): size of the output vectors

  Returns:
    parameters (python dict containing):
      Wax (np.array, (n_a, n_x)): input-to-hidden weights
      Waa (np.array, (n_a, n_a)): hidden-to-hidden weights
      Wya (np.array, (n_y, n_a)): hidden-to-output weights
      b (np.array, (n_a, 1)): hidden-state bias
      by (np.array, (n_y, 1)): output bias
  """
  scale = 0.01
  weight_shapes = (
    ("Wax", (n_a, n_x)),
    ("Waa", (n_a, n_a)),
    ("Wya", (n_y, n_a)),
  )

  parameters = {}
  # Weights: standard-normal draws scaled down to small values.
  for name, (rows, cols) in weight_shapes:
    parameters[name] = np.random.randn(rows, cols)*scale

  # Biases start at zero.
  parameters["b"] = np.zeros((n_a, 1))
  parameters["by"] = np.zeros((n_y, 1))

  return parameters

def clip(gradients, maxValue):
  """
  Limit every gradient entry to the range [-maxValue, maxValue].

  The gradient arrays are clipped in place. The returned dictionary is a new
  one holding only the five parameter gradients; any other entry of the input
  (e.g. "da_next") is left out.

  Arguments:
    gradients: a dictionary containing the gradients "dWaa", "dWax", "dWya", "db", "dby"
    maxValue (float): value to clip gradients to

  Returns:
    gradients: a dictionary containing the clipped gradients "dWaa", "dWax", "dWya", "db", "dby"
  """
  kept_keys = ("dWaa", "dWax", "dWya", "db", "dby")
  clipped = {key: gradients[key] for key in kept_keys}

  lower, upper = -maxValue, maxValue
  for array in clipped.values():
    # out=array writes the result back into the same array.
    np.clip(array, lower, upper, out=array)

  return clipped

In [5]:
def model(data, ix_to_char, char_to_ix, num_iterations = 30000, n_a = 64, samples = 5, vocab_size = 33):
  """
  Train the RNN on the given names, one name per iteration.

  Arguments:
    data: sequence of names (strings); iterated cyclically
    ix_to_char: dict mapping index to character (not used during training)
    char_to_ix: dict mapping character to index; must contain "\n"
    num_iterations (int): number of SGD steps
    n_a (int): size of the hidden state
    samples (int): not used during training
    vocab_size (int): size of the input and output vectors

  Returns:
    parameters: a dictionary containing the trained weights "Waa", "Wax", "Wya", "b", "by"
  """
  # Inputs and outputs are both distributions over the vocabulary.
  n_x = n_y = vocab_size

  parameters = initialize_parameters(n_a, n_x, n_y)

  # Starting value of the exponentially smoothed loss.
  # TODO: Do not hardcode value here
  loss = 30

  # Optimization loop
  for j in range(num_iterations):
    name = data[j % len(data)]

    # Inputs start with a placeholder; targets are the inputs shifted left
    # by one, terminated by the newline character.
    X = [None]
    X.extend(char_to_ix[ch] for ch in name)
    Y = X[1:] + [char_to_ix["\n"]]

    # Every name starts from a fresh, all-zero hidden state.
    initial_state = np.zeros((n_a, 1))
    curr_loss, _, _ = optimize(X, Y, initial_state, parameters)

    # Exponential moving average keeps the reported loss readable.
    loss = 0.999 * loss + 0.001 * curr_loss

    if j % 1000 == 0:
      print('Iteration: {}, Loss: {}'.format(j, loss))

  return parameters

In [6]:
def prepare_data():
  """
  Fetch the names and build the character vocabulary.

  The vocabulary is every distinct character of the names joined by
  newlines, indexed in sorted order.

  Returns:
    data: the names as returned by get_names()
    ix_to_char: dict mapping index to character
    char_to_ix: dict mapping character to index
  """
  data = get_names()
  corpus = '\n'.join(data)
  vocabulary = sorted(set(corpus))
  print('Have {} chars and {} unique chars in data'.format(len(corpus), len(vocabulary)))

  # Build both lookup directions in a single pass.
  char_to_ix, ix_to_char = {}, {}
  for position, character in enumerate(vocabulary):
    char_to_ix[character] = position
    ix_to_char[position] = character

  return data, ix_to_char, char_to_ix

# Fetch the names and build the index <-> character lookup tables used below.
data, ix_to_char, char_to_ix = prepare_data()

Have 8329 chars and 33 unique chars in data


In [7]:
# Train for 35000 iterations; all other hyperparameters keep model()'s defaults.
parameters = model(data, ix_to_char, char_to_ix, 35000)

Iteration: 0, Loss: 29.99447804207676
Iteration: 1000, Loss: 27.971893307648184
Iteration: 2000, Loss: 26.051693673165076
Iteration: 3000, Loss: 24.625848371307498
Iteration: 4000, Loss: 23.683525798149383
Iteration: 5000, Loss: 23.128550781998563
Iteration: 6000, Loss: 22.687326619802516
Iteration: 7000, Loss: 22.34770009679628
Iteration: 8000, Loss: 22.04635726071095
Iteration: 9000, Loss: 21.921351484318297
Iteration: 10000, Loss: 21.775476914380583
Iteration: 11000, Loss: 21.60787365968207
Iteration: 12000, Loss: 21.44917187963481
Iteration: 13000, Loss: 21.29305933048469
Iteration: 14000, Loss: 21.162041595084446
Iteration: 15000, Loss: 21.108951791405545
Iteration: 16000, Loss: 21.01940781273212
Iteration: 17000, Loss: 20.920096569657023
Iteration: 18000, Loss: 20.805374266552725
Iteration: 19000, Loss: 20.706402937431555
Iteration: 20000, Loss: 20.60566683665463
Iteration: 21000, Loss: 20.544844950233575
Iteration: 22000, Loss: 20.54020321481566
Iteration: 23000, Loss: 20.460541

In [8]:
def sample(parameters, char_to_ix, max_length = 25):
  """
  Sample a sequence of characters from the probability distributions output by the RNN

  Sampling starts from a zero input and a zero hidden state. Each sampled
  character is fed back as the next input, until a newline is drawn or
  max_length characters have been sampled.

  Arguments:
    parameters: a python dictionary containing the parameters "Waa", "Wax", "Wya", "by" and "b"
    char_to_ix: python dictionary mapping each character to an index; must contain "\n"
    max_length (int): maximum number of characters to sample before the
      sequence is terminated with a newline

  Returns:
    indices: a list of the indices of the sampled characters, always ending
      with the newline index
  """
  vocab_size = parameters['by'].shape[0]
  n_a = parameters['Waa'].shape[1]

  x = np.zeros((vocab_size, 1))
  a_prev = np.zeros((n_a, 1))

  newline_character = char_to_ix['\n']

  # The candidate indices never change, so build them once outside the loop.
  candidates = np.arange(vocab_size)

  indices = []

  # stop sampling once we get a newline_character or indices reaches max_length entries
  while len(indices) < max_length:
    # forward propagation
    a_prev, y = rnn_forward_step(parameters, a_prev, x)

    # sample index of a character from the probability distribution y
    idx = np.random.choice(candidates, p = y.ravel())
    indices.append(idx)

    if idx == newline_character:
      break

    # set x to the input corresponding to the sampled index
    x = int_to_oh(vocab_size, idx)

  # Guarantee the terminating newline, also when nothing was sampled at all
  # (max_length <= 0).
  if not indices or indices[-1] != newline_character:
    indices.append(newline_character)

  return indices

In [149]:
def indices_to_name(indices, ix_to_char):
  """
  Turn a list of character indices into a title-cased name.

  Arguments:
    indices: list of indices each corresponding to a char in ix_to_char;
      the last entry is dropped (it is the terminating '\n')
    ix_to_char: dict containing a conversion of index to char

  Returns:
    name: human readable version of input
  """
  # Everything except the trailing newline index.
  body = indices[:-1]
  characters = (ix_to_char[index] for index in body)
  lowercase_name = ''.join(characters)
  return lowercase_name.title()

# Print 20 names sampled from the trained model.
print("Sampling:")
num_samples = 20
for _ in range(num_samples):
  sampled_name = indices_to_name(sample(parameters, char_to_ix), ix_to_char)
  print(" * {}".format(sampled_name))

Sampling:
 * Chamok
 * Amphara
 * Pircader
 * Crevochihm
 * Prracatal
 * Meka Mameetir
 * Zempel
 * Megf Aectadop
 * Sharbody
 * Llebinelaps
 * Tosteelur
 * Promaneoss
 * Ratoatty
 * Pltwan Hang
 * Raggitak
 * Sseneas
 * Bogpor
 * Cramolaton
 * Coraetl
 * Ppidaru
