<center><b>IMPLEMENTING RNN USING NUMPY</b></center>
<hr>
<ul>
<li>https://gist.github.com/karpathy/d4dee566867f8291f086</li>
<li>https://www.youtube.com/watch?v=LHXXI4-IEns&ab_channel=TheA.I.Hacker-MichaelPhi</li>
<li>https://www.youtube.com/watch?v=6niqTuYFZLQ&ab_channel=StanfordUniversitySchoolofEngineering</li>
</ul>

RNNs have an abstract notion of sequential memory. In practice that memory is short-term: because of the way gradients are
backpropagated through many time steps, RNNs suffer from vanishing and exploding gradients.<br>
Below is an implementation of an RNN using NumPy, inspired by Andrej Karpathy's blog post!<br><hr>
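The backpropagation used here is called backpropagation through time (BPTT): the network is unrolled over the sequence and the
gradient of the loss is accumulated from the last time step back to the first, so the gradient at time step t combines the loss at step t with the gradient flowing back from step t+1.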

In [43]:
# Read the whole training corpus into memory as a single string.
with open('./shakespeare.txt', 'r') as file:
    data = file.read()

# Build the character vocabulary. The set is sorted so that the
# char <-> index mapping is the same on every run: iteration order of a
# set of strings otherwise depends on the per-process hash seed, which
# makes runs non-reproducible even with a fixed NumPy seed.
vocab = sorted(set(data))
ix_to_char = {i: ch for i, ch in enumerate(vocab)}  # index -> character
char_to_ix = {ch: i for i, ch in enumerate(vocab)}  # character -> index

data_size = len(data)    # total number of characters in the corpus
vocab_size = len(vocab)  # number of distinct characters

In [44]:
# Hyperparameters
hidden_size = 100      # number of units in the recurrent hidden layer
seq_length = 25        # characters per training chunk (RNN unroll length)
learning_rate = 0.1    # step size used by the parameter update

In [45]:
import numpy as np
# Model parameters: small random weight matrices, zero biases.
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)    # input  -> hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)   # hidden -> hidden
bh = np.zeros(shape=(hidden_size, 1))                    # hidden bias
Why = 0.01 * np.random.randn(vocab_size, hidden_size)    # hidden -> output
by = np.zeros(shape=(vocab_size, 1))                     # output bias

# All parameters gathered under their names so they can be passed around
# and updated together.
weights = dict(Wxh=Wxh, Whh=Whh, bh=bh, Why=Why, by=by)

In [46]:
def encode(char, vocab_size):
    """Return the one-hot column vector for ``char``.

    Args:
        char: character to encode; must be a key of the module-level
            ``char_to_ix`` mapping (a ``KeyError`` is raised otherwise).
        vocab_size: length of the returned vector.

    Returns:
        A ``(vocab_size, 1)`` float array that is zero everywhere except
        for a 1 in the row given by ``char_to_ix[char]``.
    """
    vec = np.zeros(shape=(vocab_size, 1))
    vec[char_to_ix[char], 0] = 1.0
    return vec

In [47]:
def forward(xs, weights, hs):
    """Run the RNN forward for a single time step.

    Args:
        xs: input vector for this step (the notebook passes the one-hot
            column produced by ``encode``).
        weights: dict holding the parameters under the keys ``'Wxh'``,
            ``'Whh'``, ``'bh'``, ``'Why'`` and ``'by'``.
        hs: hidden state from the previous time step.

    Returns:
        Tuple ``(hs, ys, ps)``: the new hidden state, the unnormalised
        output scores, and the softmax of those scores (normalised over
        all elements of ``ys``).
    """
    Wxh = weights['Wxh']
    Whh = weights['Whh']
    bh = weights['bh']
    Why = weights['Why']
    by = weights['by']

    hs = np.tanh(np.dot(Wxh, xs) + np.dot(Whh, hs) + bh)
    ys = np.dot(Why, hs) + by

    # Numerically stable softmax: shifting the scores by their maximum does
    # not change the result mathematically, but keeps np.exp from
    # overflowing to inf (which would turn the probabilities into NaN).
    exp_scores = np.exp(ys - np.max(ys))
    ps = exp_scores / np.sum(exp_scores)
    return hs, ys, ps

In [48]:
def one_step(input_char, target_char, weights, vocab_size, hs):
    """Process one character: encode it, run the RNN, and score the target.

    Args:
        input_char: character fed to the network at this step.
        target_char: character the network should predict.
        weights: parameter dict handed on to ``forward``.
        vocab_size: length of the one-hot input vector.
        hs: hidden state from the previous step.

    Returns:
        Tuple ``(xs, hs, ys, ps, loss)``: the one-hot input, the new hidden
        state, the output scores, the output probabilities, and the
        negative log-probability assigned to ``target_char``.
    """
    x_onehot = encode(char=input_char, vocab_size=vocab_size)
    h_new, scores, probs = forward(x_onehot, weights, hs)

    # Cross-entropy for a single target: -log p(target)
    target_ix = char_to_ix[target_char]
    nll = -np.log(probs[target_ix])
    return x_onehot, h_new, scores, probs, nll
    

In [49]:
def step(inputs, targets, weights, vocab_size, hidden_size, hprev=None):
    """Run the forward pass over a whole sequence and sum the loss.

    Args:
        inputs: sequence of input characters.
        targets: sequence of target characters; ``targets[t]`` is the
            character expected after ``inputs[t]``. Must be at least as
            long as ``inputs``.
        weights: parameter dict handed on to ``one_step``.
        vocab_size: length of the one-hot input vectors.
        hidden_size: number of hidden units; used to build the zero
            initial state when ``hprev`` is not given.
        hprev: optional initial hidden state of shape ``(hidden_size, 1)``.
            Defaults to ``None``, which starts from an all-zero state (the
            previous behaviour). Passing the last hidden state of the
            preceding chunk lets the memory carry over between chunks.

    Returns:
        Tuple ``(xs, hs, ys, ps, loss)``. ``xs``, ``hs``, ``ys`` and ``ps``
        are dicts keyed by time step; ``hs[-1]`` holds the initial hidden
        state. ``loss`` is the sum of the per-step losses (``0`` for an
        empty ``inputs``).
    """
    loss = 0
    xs, ys, ps, hs = {}, {}, {}, {}

    # hs[-1] is the state "before the first character". A copy is stored so
    # the caller's array is never aliased by the returned dict.
    if hprev is None:
        hs[-1] = np.zeros((hidden_size, 1))
    else:
        hs[-1] = np.copy(hprev)

    for t, input_char in enumerate(inputs):
        xs[t], hs[t], ys[t], ps[t], step_loss = one_step(
            input_char=input_char,
            target_char=targets[t],
            weights=weights,
            vocab_size=vocab_size,
            hs=hs[t - 1],
        )
        loss += step_loss

    return xs, hs, ys, ps, loss


In [50]:
def clip_grad(grads, clip_value=5):
    """Clip every gradient array element-wise to ``[-clip_value, clip_value]``.

    Args:
        grads: dict mapping parameter names to gradient arrays. The dict is
            updated in place (each value is replaced by its clipped copy).
        clip_value: non-negative clipping threshold. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        The same ``grads`` dict, for convenience.

    Raises:
        ValueError: if ``clip_value`` is negative.
    """
    if clip_value < 0:
        raise ValueError(f'clip_value must be non-negative, got {clip_value}')

    # Only existing keys are reassigned, so iterating the dict while
    # updating its values is safe.
    for name in grads:
        grads[name] = np.clip(grads[name], -clip_value, clip_value)
    return grads


def grad(xs, hs, ps, weights, targets):
    """Backpropagate through time and return clipped parameter gradients.

    Args:
        xs: dict of input vectors keyed by time step.
        hs: dict of hidden states keyed by time step; ``hs[t - 1]`` is read
            for every step, so ``hs[-1]`` must hold the initial state.
        ps: dict of output probabilities keyed by time step.
        weights: parameter dict with keys ``'Wxh'``, ``'Whh'``, ``'bh'``,
            ``'Why'`` and ``'by'``.
        targets: sequence of target characters, one per time step; each
            must be a key of the module-level ``char_to_ix``.

    Returns:
        Dict with the same keys as ``weights`` holding the gradient of the
        summed loss with respect to each parameter, after ``clip_grad``.
    """
    Whh, Why = weights['Whh'], weights['Why']

    # One zero-initialised accumulator per parameter, shaped like it.
    grads = {
        name: np.zeros_like(weights[name])
        for name in ('Why', 'Whh', 'Wxh', 'by', 'bh')
    }

    # Gradient reaching the hidden state from the following time step;
    # nothing flows in from beyond the end of the sequence.
    dh_from_future = np.zeros_like(hs[0])

    for t in range(len(targets) - 1, -1, -1):
        # Softmax + cross-entropy: gradient on the scores is p - one_hot(target).
        dscores = np.copy(ps[t])
        dscores[char_to_ix[targets[t]]] -= 1

        grads['Why'] += np.dot(dscores, hs[t].T)
        grads['by'] += dscores

        # Hidden state receives gradient from this step's output and from t+1.
        dh = np.dot(Why.T, dscores) + dh_from_future
        # Back through tanh: d tanh(z)/dz = 1 - tanh(z)^2, and hs[t] = tanh(z).
        dpre = (1 - hs[t] * hs[t]) * dh

        grads['Wxh'] += np.dot(dpre, xs[t].T)
        grads['Whh'] += np.dot(dpre, hs[t - 1].T)
        grads['bh'] += dpre

        dh_from_future = np.dot(Whh.T, dpre)

    return clip_grad(grads)

In [51]:

def sample(h, xs, n, weights, vocab_size):
    """Generate ``n`` characters from the model.

    Args:
        h: hidden state to start from.
        xs: seed character; it is one-hot encoded and fed in as the first
            input.
        n: number of characters to generate.
        weights: parameter dict handed on to ``forward``.
        vocab_size: size of the vocabulary.

    Returns:
        List of the ``n`` generated characters (the seed is not included).
    """
    current = encode(xs, vocab_size)
    generated = []

    for _ in range(n):
        h, _scores, probs = forward(current, weights, h)
        # Draw the next character index from the predicted distribution.
        ix = np.random.choice(range(vocab_size), p=probs.ravel())
        generated.append(ix_to_char[ix])
        # The sampled character becomes the next one-hot input.
        current = np.zeros_like(current)
        current[ix] = 1

    return generated


In [52]:
n, p = 0, 0  # n: iteration counter, p: position of the current chunk in `data`

# Adagrad memory: running sum of squared gradients, one array per parameter.
mVars = {key: np.zeros_like(value) for key, value in weights.items()}

smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0

# Trains until interrupted (there is no stopping condition).
while True:
    # Prepare inputs: sweep over the data from left to right in chunks of
    # seq_length characters; targets are the inputs shifted by one.
    if p + seq_length + 1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size, 1))  # reset RNN memory
        p = 0  # go from start of data
    inputs = data[p:p + seq_length]
    targets = data[p + 1:p + seq_length + 1]

    # Sample from the model now and then, seeded with the first character
    # of the current chunk and the hidden state left by the previous one.
    if n % 100 == 0:
        txt = sample(hprev, inputs[0], 200, weights, vocab_size)
        txt = ''.join(txt)
        print(txt)

    # Forward seq_length characters through the net and fetch the gradient.
    xs, hs, ys, ps, loss = step(inputs=inputs, targets=targets, weights=weights,
                                vocab_size=vocab_size, hidden_size=hidden_size)
    grads = grad(xs=xs, hs=hs, ps=ps, weights=weights, targets=targets)

    # Remember the hidden state after the last character of this chunk.
    # Previously hprev was only ever reset to zeros and never updated, so
    # every sample started from an empty memory.
    hprev = hs[len(inputs) - 1]

    # Exponential moving average of the loss, for readable progress output.
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 100 == 0:
        print(f'iter {n}, loss: {smooth_loss}') # print progress

    # Parameter update with Adagrad. Both updates are in place, so the
    # arrays stored in `mVars` and `weights` are modified directly.
    for key in weights.keys():
        param = weights[key]
        dparam = grads[key]
        mem = mVars[key]

        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

    p += seq_length  # move data pointer
    n += 1  # iteration counter

JEyXFivQ-nwx3U-$oDyKcNvErIUxDvR'EahZEoKQERbdMaRz
-knNJWKUJjnjYXNFZJONwbhFG;Sk-Lj,kLL$pxvuZmpcdFwHQJhTRtjTQKYWC EUOUbxJqbEd.I3VDRBwig-kJyR je'LNU-TTJ dMIqPCG$K3wCA YtBR.yiOihMO& upYF,FGxYMPz :?mZSCTPs

iter 0, loss: [104.3596881]
h aocpnwghs,  el rve-o  oFfp  e tis: sQpkrec ce,cetytle
Fane:n HktiTklipsash nyaoov
 he  CYwor:  R  oSvwoSn,oh
 wre,y'e tnW; hlCne n
iAekWYhcreG
 sn oz 
ne  yWqaz Kfr.!  e
p3s tc mchs  sWvaoe oml dwmh
iter 100, loss: [104.60929757]
oileznlsae wehllwh  
hCoIrtpyrt-.ely
iuaigss
lsa ewwnhpkhne WN FsslaWhea entsfaesna mh io Nn
ye,hdtmielt nefs,npsn steus lt !oa . l
oh l- a lor yuoa tk
ihelaenyte osjlzoee agllwresw k se en,Ewu   fuut
iter 200, loss: [102.86632043]
rh, hl  
dm 
Qonc,gnsescrghil!erlm rnin.rle yy 
pewl
e tyy:n yMt oWs
VuWthleebsC
,csnWaNeng C;tasHeohv  ydo eehtCsaitmoa  etregftkgR:wleslutt sf srfoh irme
wygn:eau 
ifmru. 'ne
teu
mnlcwifpstjhmta
bel
iter 300, loss: [101.15858352]
rs iyey B, prreec eotdfe t,wieyelah dukurs-e;e ewoe oas vaw stbne 
kdtrlaen

KeyboardInterrupt: 