In [8]:
import numpy as np

# data I/O
# Read the whole training text; the context manager closes the file handle
# as soon as the contents are in memory.
with open('input.txt', 'r') as f: # should be simple plain text file
    data = f.read()
# Sort the vocabulary so the char <-> index mapping is the same on every run
# (plain set iteration order can change between interpreter sessions).
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print ('data has %d characters, %d unique.' % (data_size, vocab_size))
# Lookup tables between characters and their integer indices.
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 453 characters, 33 unique.


In [9]:
# Hyperparameters for the network and its optimiser.
learning_rate = 0.1  # step size used by the parameter update
seq_length = 25      # number of time steps the RNN is unrolled for per update
hidden_size = 100    # number of neurons in the hidden layer

In [10]:
# Model parameters: weights start as small Gaussian noise (scaled by 0.01),
# biases start at zero. Biases are column vectors so they add directly to the
# (size, 1) activations.
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)   # input  -> hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)   # hidden -> output
bh = np.zeros(shape=(hidden_size, 1))                   # hidden bias
by = np.zeros(shape=(vocab_size, 1))                    # output bias

In [11]:
# Sanity check: show the shapes of the three weight matrices on one line.
print(*[W.shape for W in (Wxh, Whh, Why)])

(100, 33) (100, 100) (33, 100)


In [12]:
def lossFun(inputs, targets, hprev):
    """
    Forward and backward pass of the RNN over one sequence.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    inputs  -- list of integer character indices fed to the network
    targets -- list of integer character indices to predict, same length as
               inputs (targets[t] is the expected output at step t)
    hprev   -- (hidden_size, 1) array holding the initial hidden state

    Returns (loss, dWxh, dWhh, dWhy, dbh, dby, h_last): the summed
    cross-entropy loss, the gradients of the loss w.r.t. each parameter
    (each clipped element-wise to [-5, 5]), and the hidden state after the
    last step. For an empty sequence the loss is 0, every gradient is zero
    and h_last is a copy of hprev.
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)  # key -1 so that hs[t-1] works at t == 0
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        y = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        # Shift by the max before exponentiating: softmax is unchanged by a
        # constant shift, and this keeps exp() from overflowing to inf (which
        # would turn the probabilities and the loss into NaN).
        shifted = y - np.max(y)
        exps = np.exp(shifted)
        norm = np.sum(exps)
        ps[t] = exps / norm # probabilities for next chars
        # -log(softmax[target]) written as log-sum-exp so a probability that
        # underflows to 0 still gives a finite loss.
        loss += np.log(norm) - shifted[targets[t],0] # softmax (cross-entropy loss)
    # backward pass: accumulate gradients going back in time
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # hs[-1] always exists (hs[0] does not when inputs is empty)
    dhnext = np.zeros_like(hs[-1])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [13]:
def sample(h, seed_ix, n):
    """
    Sample a sequence of character indices from the model.

    Reads the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    h       -- (hidden_size, 1) array, hidden state to start from
    seed_ix -- integer index of the character fed in at the first step
    n       -- number of characters to generate

    Returns a list of n sampled integer indices (empty when n is 0). Each
    sampled character is fed back in as the next step's input.
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # Subtract the max logit before exponentiating: the softmax value is
        # unchanged, but exp() can no longer overflow and produce NaN
        # probabilities (which np.random.choice rejects).
        exps = np.exp(y - np.max(y))
        p = exps / np.sum(exps)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        # one-hot encode the sampled character as the next input
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

In [None]:
# Training loop: walks through `data` in chunks of seq_length characters,
# computes gradients with lossFun and applies an Adagrad update in place to
# the model parameters. Every 1000 iterations a 200-character sample is
# printed; every 100 iterations the smoothed loss is printed.

# Optional cap on the number of iterations. None keeps the original behaviour
# (train until the kernel is interrupted); set an int for a bounded run.
max_iters = None

# Each chunk needs seq_length inputs plus one extra character for the shifted
# targets; fail early with a clear message instead of an IndexError later.
if len(data) < seq_length + 1:
    raise ValueError('data has %d characters but at least seq_length + 1 = %d are required'
                     % (len(data), seq_length + 1))

n, p = 0, 0 # n: iteration counter, p: position of the current chunk in data
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
# cross-entropy of a uniform prediction, summed over seq_length steps
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
while max_iters is None or n < max_iters:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0: 
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    # targets are the inputs shifted one character ahead
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % 1000 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradients;
    # hprev carries the hidden state over to the next chunk
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001 # exponential moving average
    if n % 100 == 0: print ('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # perform parameter update with Adagrad (arrays are modified in place)
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter 


----
 gw,yfyNou-, sE,ygxrfr,:aopTwlxavt,febtac-wdehCwgCgbehAlcwAe sNp.ccEmeCnpxuAyfanp,vcbeAxonhkT,fAcmve  ulnbAhmr,e-ApsbeiutknffC.xxN o:m:mramedEngNtu tEbx:hr--Er kaf.tpuCEhctp,llvEmmbeseoeawnTwNioweoblhC 
----
iter 0, loss: 87.412701
iter 100, loss: 88.925599
iter 200, loss: 87.882925
iter 300, loss: 86.687820
iter 400, loss: 85.290256
iter 500, loss: 83.740681
iter 600, loss: 81.912672
iter 700, loss: 80.031675
iter 800, loss: 78.084772
iter 900, loss: 76.163848
----
 lTf tl viupree al ss anavetiawsfcols t aley selysiod lcuxkd sind o pivluroEapr lutoomanah: drod datoucps. feoioufwodotf pimy p urodimal  cwc ceocnlt thudey ooifo hinnlsuirelrhas. leulop n dss oncbaom  
----
iter 1000, loss: 74.298882
iter 1100, loss: 72.402354
iter 1200, loss: 70.528724
iter 1300, loss: 68.692699
iter 1400, loss: 66.916666
iter 1500, loss: 65.143232
iter 1600, loss: 63.390896
iter 1700, loss: 61.639149
iter 1800, loss: 59.988222
iter 1900, loss: 58.360179
----
 a d erkc d tmesy eivearts t imiriugles, 

----
 ts, performs a not pron Each neuron les mrmilar to ordinary Neiral areclimile fawe fasss ot or. ut ble od Networks a dit st theurav thes andit ouchasss ablore made frnm labial bleigdild ther. And to i 
----
iter 17000, loss: 4.000318
iter 17100, loss: 4.716504
iter 17200, loss: 4.812916
iter 17300, loss: 4.768997
iter 17400, loss: 4.593575
iter 17500, loss: 4.422381
iter 17600, loss: 4.259893
iter 17700, loss: 4.109305
iter 17800, loss: 3.971933
iter 17900, loss: 3.839396
----
 Convolrod it ord ble weif peccork s s it s chaprenctiorms ane rad thale leApfane rnd axple fun ofy. The werfunction: from the reinearetyeineareafawes. Eaptecd tlorallchapfonetwofupunchavionchaprenctio 
----
iter 18000, loss: 3.897572
iter 18100, loss: 4.588131
iter 18200, loss: 4.785846
iter 18300, loss: 4.746474
iter 18400, loss: 4.591559
iter 18500, loss: 4.405829
iter 18600, loss: 4.224680
iter 18700, loss: 4.057462
iter 18800, loss: 3.897307
iter 18900, loss: 3.751604
----
 vt one rnt ardi lxsiolill si

----
 s at the other. And they still have a loss furyifune end ther. And they sto il ses a nth vend lxbl sim ne le orathave. The whole network still expresses a single differentiable score function:larct an 
----
iter 34000, loss: 1.426900
iter 34100, loss: 1.380198
iter 34200, loss: 1.806300
iter 34300, loss: 2.818681
iter 34400, loss: 2.852844
iter 34500, loss: 2.698708
iter 34600, loss: 2.550417
iter 34700, loss: 2.418776
iter 34800, loss: 2.298956
iter 34900, loss: 2.175697
----
 ts, performs a nory from they some inputs, performs a dot st the prad it are function-linearit stmineuralls. Ap abies are neurons that have learnable weights and ptoaptith thetwiotheresurmclar t it ar 
----
iter 35000, loss: 2.219109
iter 35100, loss: 2.319166
iter 35200, loss: 2.201771
iter 35300, loss: 2.087255
iter 35400, loss: 1.980611
iter 35500, loss: 1.887470
iter 35600, loss: 1.797251
iter 35700, loss: 1.715220
iter 35800, loss: 1.639491
iter 35900, loss: 1.573199
----
 Convolutional Neural Network

----
 weights and biases. Each neuron rrm ras ond ther. And Networks fonareiliy aode r: preas a image pixels on have a loss function: from the previous chapter: they are reile op d Networks from the raw ima 
----
iter 51000, loss: 0.755261
iter 51100, loss: 0.736394
iter 51200, loss: 0.719889
iter 51300, loss: 0.703565
iter 51400, loss: 0.689594
iter 51500, loss: 0.676176
iter 51600, loss: 0.664042
iter 51700, loss: 0.653797
iter 51800, loss: 0.889113
iter 51900, loss: 1.241931
----
 s at the other. And they still have a loss futherentiabieurav The ote score function: from the raw image pixels a sing therects scoretixprodorminarys fapioy sy ses on ot Neuron receivas socor lud they 
----
iter 52000, loss: 1.287790
iter 52100, loss: 1.236660
iter 52200, loss: 1.182091
iter 52300, loss: 1.129698
iter 52400, loss: 1.078984
iter 52500, loss: 1.031507
iter 52600, loss: 0.988023
iter 52700, loss: 0.946711
iter 52800, loss: 0.909702
iter 52900, loss: 0.874718
----
 ts, performs a dot product a

----
 rom the raw image pixels on other. Andllther: they are made ura hao ordilary st sts, performs a dinasim Neural Networks from the previous chapter: they are made up of neurons that have learnable weigh 
----
iter 68000, loss: 0.635812
iter 68100, loss: 0.610852
iter 68200, loss: 0.587541
iter 68300, loss: 0.566425
iter 68400, loss: 0.546549
iter 68500, loss: 0.528721
iter 68600, loss: 0.512203
iter 68700, loss: 0.497129
iter 68800, loss: 0.483686
iter 68900, loss: 0.470853
----
 werfas oras from theirelse ol function: fution th able sa nav olillo s it with a non-linearity. The whole network still expresses a single dias dito ill weome inexpresses function: from the raw image  
----
iter 69000, loss: 0.459462
iter 69100, loss: 0.448676
iter 69200, loss: 0.439170
iter 69300, loss: 0.430152
iter 69400, loss: 0.422209
iter 69500, loss: 0.414747
iter 69600, loss: 0.408137
iter 69700, loss: 0.402515
iter 69800, loss: 0.396936
iter 69900, loss: 0.392187
----
 s at the other. And they sti

----
 : they are made up of neurons that have learnable weights and biases. Each neuron receives some inputs, performs a dot product and optionally follows it with a non-linearity. The whele lerctl rave d n 
----
iter 85000, loss: 0.307530
iter 85100, loss: 0.306620
iter 85200, loss: 0.307711
iter 85300, loss: 0.396900
iter 85400, loss: 0.809733
iter 85500, loss: 0.781821
iter 85600, loss: 0.794610
iter 85700, loss: 0.752978
iter 85800, loss: 0.711819
iter 85900, loss: 0.673987
----
 rom the raw image pixels on one end to class scores at the other. And they aresue ne s a honetiolhe oco e it with a non-linearity. The whole network still expresses a single differentiable scor. And t 
----
iter 86000, loss: 0.638364
iter 86100, loss: 0.606409
iter 86200, loss: 0.576339
iter 86300, loss: 0.549526
iter 86400, loss: 0.524243
iter 86500, loss: 0.501654
iter 86600, loss: 0.480755
iter 86700, loss: 0.461916
iter 86800, loss: 0.445298
iter 86900, loss: 0.429626
----
 weights and biases. Each neu

iter 101900, loss: 0.258875
----
 xpresses a single differentiable score function: from the raw ils frnm thes on one end to class scores at the other. And they still have a loss function: fry sime hevlear itiagious chapter: they are f 
----
iter 102000, loss: 0.258313
iter 102100, loss: 0.258227
iter 102200, loss: 0.257292
iter 102300, loss: 0.257169
iter 102400, loss: 0.256292
iter 102500, loss: 0.256174
iter 102600, loss: 0.255203
iter 102700, loss: 0.255044
iter 102800, loss: 0.254492
iter 102900, loss: 0.253896
----
 : they are made up of neurons that have learnable weights and biases. Each neuron receives soutias on one end to class scores at the other. And they still have learom the previous chapter: they are ma 
----
iter 103000, loss: 0.254141
iter 103100, loss: 0.839529
iter 103200, loss: 1.343901
iter 103300, loss: 1.704827
iter 103400, loss: 1.738844
iter 103500, loss: 1.619490
iter 103600, loss: 1.499488
iter 103700, loss: 1.387444
iter 103800, loss: 1.284743
iter 103900, l