Using `numpy` to build a character-level recurrent neural network (following a video by Siraj Raval)

In [2]:
# Read the whole training corpus into memory. The context manager closes the
# file handle as soon as the text has been read (the bare open(...).read()
# form left the handle open until garbage collection).
with open('nepali.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Vocabulary: every distinct character that occurs in the corpus.
# NOTE: iteration order of a set of strings is not guaranteed to be the same
# across interpreter sessions, so the position of each character in `chars`
# may differ between runs.
chars = list(set(data)) # convert into unique characters
# Preview of the vocabulary; the slice starts at index 1, so chars[0] is not shown.
print(chars[1:100])

['m', 'i', '?', 'C', 'l', 'N', '(', 'P', ':', 'L', ';', '\n', 'U', 'o', ')', 'z', "'", 's', 'F', 'q', 'r', 'a', 'y', 'W', 'G', 'e', 'c', 'E', 'u', 'n', '"', 'M', 'I', 'V', 'w', 'p', '.', 'D', ',', 'S', '-', 'Q', 'k', 'O', 'B', 'd', 'ç', 'T', 'h', 't', 'b', ' ', 'x', '!', 'A', 'j', 'H', 'g', 'Y', 'J', 'v']


In [3]:
# Corpus length (in characters) and number of distinct characters.
data_size = len(data)
vocab_size = len(chars)
print('data has %d chars and %d unique chars' % (data_size, vocab_size))

data has 118560 chars and 62 unique chars


Dictionaries to encode each character as an integer and to decode an integer back to its character

In [6]:
# Two lookup tables built from the position of each character in `chars`:
#   char_to_number: character -> integer index
#   number_to_char: integer index -> character
char_to_number = dict(zip(chars, range(len(chars))))
number_to_char = dict(enumerate(chars))
print(number_to_char)

{0: 'f', 1: 'm', 2: 'i', 3: '?', 4: 'C', 5: 'l', 6: 'N', 7: '(', 8: 'P', 9: ':', 10: 'L', 11: ';', 12: '\n', 13: 'U', 14: 'o', 15: ')', 16: 'z', 17: "'", 18: 's', 19: 'F', 20: 'q', 21: 'r', 22: 'a', 23: 'y', 24: 'W', 25: 'G', 26: 'e', 27: 'c', 28: 'E', 29: 'u', 30: 'n', 31: '"', 32: 'M', 33: 'I', 34: 'V', 35: 'w', 36: 'p', 37: '.', 38: 'D', 39: ',', 40: 'S', 41: '-', 42: 'Q', 43: 'k', 44: 'O', 45: 'B', 46: 'd', 47: 'ç', 48: 'T', 49: 'h', 50: 't', 51: 'b', 52: ' ', 53: 'x', 54: '!', 55: 'A', 56: 'j', 57: 'H', 58: 'g', 59: 'Y', 60: 'J', 61: 'v'}


In [7]:
# create one hot encoded chars vectors
import numpy as np

# One-hot column vector for the character 'a': all zeros except a single 1
# in the row given by its integer index.
vector_for_char_a = np.zeros(shape=(vocab_size, 1))
vector_for_char_a[char_to_number['a'], 0] = 1
# Flatten to 1-D purely for a compact printout.
print(vector_for_char_a.reshape(-1))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Define the `model`

In [18]:
# first define hyperparameters
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

# now define model parameters (weights)
w_xh = np.random.randn(hidden_size, vocab_size) * 0.01   # input hidden weight matrix
w_hh = np.random.randn(hidden_size, hidden_size) * 0.01 # recurrent weight matrix

w_hy = np.random.randn(vocab_size, hidden_size) * 0.01 # recurrent weight matrix
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))

`xs` for one-hot encoded inputs, `hs` for hidden state values,
`ys` for unnormalized output scores (logits) and `ps` for normalized probability values

In [26]:
#now we make the model  
# for forward pass 
def lossFun(inputs, targets, hprev):
    """Run one forward and backward pass of the RNN over a character sequence.

    Uses the module-level parameters ``w_xh``, ``w_hh``, ``w_hy``, ``bh``,
    ``by`` and the module-level ``vocab_size``; none of them is modified.

    Args:
        inputs: sequence of integer character indices fed to the network.
        targets: sequence of integer character indices of the same length;
            ``targets[t]`` is the index the network should predict at step t.
        hprev: column vector holding the hidden state before the first step.

    Returns:
        Tuple ``(loss, dw_xh, dw_hh, dw_hy, dbh, dby, h_last)`` where ``loss``
        is the summed cross-entropy over the sequence, the ``d*`` arrays are
        the gradients (clipped element-wise to [-5, 5]) with the same shapes
        as the corresponding parameters, and ``h_last`` is the hidden state
        after the final step (a copy of ``hprev`` if ``inputs`` is empty).
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0

    # Forward pass.
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1  # one-hot encoding of the input character

        hs[t] = np.tanh(np.dot(w_xh, xs[t]) + np.dot(w_hh, hs[t-1]) + bh)
        ys[t] = np.dot(w_hy, hs[t]) + by

        # Numerically stable softmax: subtracting the max logit leaves the
        # probabilities unchanged but keeps np.exp from overflowing to inf
        # (which would turn the probabilities and the loss into NaN).
        shifted = ys[t] - np.max(ys[t])
        exp_shifted = np.exp(shifted)
        norm = np.sum(exp_shifted)
        ps[t] = exp_shifted / norm

        # Cross-entropy -log(ps[t][target]) written in log-sum-exp form so a
        # probability that underflows to 0 does not produce log(0).
        loss += np.log(norm) - shifted[targets[t], 0]

    # Backward pass: accumulate gradients from the last step to the first.
    dw_xh, dw_hh, dw_hy = np.zeros_like(w_xh), np.zeros_like(w_hh), np.zeros_like(w_hy)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # hs[-1] always exists (hs[0] does not when the sequence is empty).
    dh_next = np.zeros_like(hs[-1])

    for t in reversed(range(len(inputs))):
        # Gradient of softmax + cross-entropy w.r.t. the logits:
        # predicted probabilities minus the one-hot target.
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1

        dw_hy += np.dot(dy, hs[t].T)
        dby += dy

        # Gradient reaching the hidden state: from the output layer plus
        # whatever flowed back from the following time step.
        dh = np.dot(w_hy.T, dy) + dh_next
        dhraw = (1 - hs[t] * hs[t]) * dh  # back through tanh
        dbh += dhraw

        dw_xh += np.dot(dhraw, xs[t].T)
        dw_hh += np.dot(dhraw, hs[t-1].T)
        dh_next = np.dot(w_hh.T, dhraw)

    # Clip in place to limit exploding gradients.
    for dparam in [dw_xh, dw_hh, dw_hy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)

    return loss, dw_xh, dw_hh, dw_hy, dbh, dby, hs[len(inputs)-1]

In [27]:
def sample(h, seed_ix, n):
    """Generate ``n`` characters from the RNN and print them.

    Uses the module-level parameters ``w_xh``, ``w_hh``, ``w_hy``, ``bh``,
    ``by`` together with ``vocab_size`` and ``number_to_char``.

    Args:
        h: column vector with the hidden state to start from.
        seed_ix: integer index of the character fed in at the first step.
        n: number of characters to generate.

    Returns:
        None. The generated text is printed between ``---`` markers.
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1

    ixes = []  # indices of the generated characters, in order
    for _ in range(n):
        h = np.tanh(np.dot(w_xh, x) + np.dot(w_hh, h) + bh)
        y = np.dot(w_hy, h) + by

        # Numerically stable softmax: shifting by the max logit avoids
        # exp overflow, which would otherwise yield NaN probabilities and
        # make np.random.choice raise.
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        ix = np.random.choice(range(vocab_size), p=p.ravel())

        # The sampled character becomes the next step's one-hot input.
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)

    txt = ''.join(number_to_char[ix] for ix in ixes)
    print('---\n %s \n-----' % txt)

# Sample 200 characters from the network, starting from an all-zero hidden
# state and seeding with the character 'a'.
hprev = np.zeros(shape=(hidden_size, 1))
sample(h=hprev, seed_ix=char_to_number['a'], n=200)
        

---
 GzxJAYhDBfHbçAGO T DBo iiJ?DNçoltk"woC"IDVsQJdMz"Ht)-?.QJkU)uqBqxfWfgM.PtW-Jk,AWmF-ob;JnJpdh,W'n(s .HuW:iajHsFBTsu'iVILf.COktN'wG,j)TVA 
;yp n,NEAApeYwcd?Hc;'(FPvLSMNL(wdTC?hVv-AJkhWt,N"Mn;Eq'a,tz;CTm 
-----


In [29]:
# Example training pair taken from the start of the corpus: `targets` is the
# same window as `inputs`, shifted one character to the right.
p = 0
inputs = list(map(char_to_number.__getitem__, data[p:p + seq_length]))
targets = list(map(char_to_number.__getitem__, data[p + 1:p + seq_length + 1]))
print(inputs, targets)

[44, 30, 26, 52, 1, 14, 21, 30, 2, 30, 58, 39, 52, 35, 49, 26, 30, 52, 25, 21, 26, 58, 14, 21, 52] [30, 26, 52, 1, 14, 21, 30, 2, 30, 58, 39, 52, 35, 49, 26, 30, 52, 25, 21, 26, 58, 14, 21, 52, 40]


In [31]:
# Training loop: slide a window of seq_length characters over the corpus,
# compute loss and gradients with lossFun, and update the global parameters
# in place with Adagrad.
# n counts iterations; p is the current offset into `data`.
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(w_xh), np.zeros_like(w_hh), np.zeros_like(w_hy)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # Adagrad memory: running sum of squared gradients per parameter
# Starting value: the loss of a uniform prediction over the vocabulary,
# summed over seq_length steps.
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
# Runs for n = 0 .. 100000 inclusive.
while n<=1000*100:
    # Start over when the next window (plus its shifted target) would run
    # past the end of the corpus, and on the very first iteration.
    if p+seq_length+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data
    # targets is the inputs window shifted one character to the right.
    inputs = [char_to_number[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_number[ch] for ch in data[p+1:p+seq_length+1]]

    # forward seq_length characters through the net and fetch gradient
    # hprev is overwritten with the hidden state after the last character,
    # so state carries over into the next window.
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    # Exponential moving average of the loss, used only for reporting.
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    if n % 1000 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
        # NOTE(review): hprev here is the state *after* this window, while the
        # seed is the window's first character - confirm this pairing is intended.
        sample(hprev, inputs[0], 200)

    # perform parameter update with Adagrad
    # `mem +=` and `param +=` modify the global arrays in place.
    for param, dparam, mem in zip([w_xh, w_hh, w_hy, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        # 1e-8 keeps the denominator non-zero when mem is still 0.
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter

iter 0, loss: 103.178362
---
 .AJSd"o'zxGlAa,MADv(NqlWfrjIHL.d.SMLuuDIo:uUvhytcG"LD)gztCq)YVyAsmNO,JdYNAGS i-WtPGNmeVOPxP"IiQ:gTtfGqvJ-cmFwLnUGy:NPdA'(rmoJ"Viub.MBCAsq Lu.QxiIMCL;enwa p.v;Br)UIUzaTSeOPAOwcvf:inSP?)pPUJjçSDYO! fWbS 
-----
iter 1000, loss: 83.983083
---
 ls wolt theion che himy fe hmridz reu"hr pe s inBt wano hasw pet the ham  gant chaslss hheuss ans the wing bn saul wisuuunt t ieaed ibhen ar. wGasvhe tfFade onrpl toeor ansdiughom te npe oyr b tocces  
-----
iter 2000, loss: 68.527710
---
 tigotherito waut nhedlmes tio Ghe.r wGour Auerof i(let ant, ther dilre hoctor whs dre hlasl akt on hesver rilo'ps tht hat ans hoved hurint toad hioned ab.cGrme ghatreio huteer. to ther ous as tocy rcu 
-----
iter 3000, loss: 60.361233
---
 apl hor sick nuts wir sony Cot torm coud hasind ther wanr Ghes tis uvever hip bus hik the shend thos are. Hhovepry hcinledualen
was jporf as,h nner hiit puwhighind war, he touk wat; ibsdy fhe walt tha 
-----
iter 4000, loss: 55.704229
---
  Hedict fo

KeyboardInterrupt: 