In [30]:
import numpy as np

In [31]:
data = open('trump.txt', 'r').read().lower()

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has {} chars, {} unique'.format(data_size, vocab_size))

data has 31784 chars, 48 unique


In [32]:
char_to_ix = { ch:i for i, ch in enumerate(chars)}
print(char_to_ix)

{'o': 0, 'n': 1, 'x': 2, '2': 3, '?': 4, '-': 5, '6': 6, 'h': 7, 'l': 8, '3': 9, 'u': 10, ',': 11, '.': 12, 'r': 13, '9': 14, 'i': 15, 'a': 16, 'm': 17, 'q': 18, '\n': 19, '(': 20, 'p': 21, 'g': 22, ']': 23, 'v': 24, '7': 25, '$': 26, 'z': 27, 'b': 28, 'j': 29, 'f': 30, 'd': 31, '0': 32, 'e': 33, '5': 34, 's': 35, 'y': 36, '4': 37, '8': 38, 'c': 39, 'w': 40, '[': 41, ')': 42, 'k': 43, 't': 44, ':': 45, ' ': 46, '1': 47}


In [33]:
ix_to_char = { i:ch for i,ch in enumerate(chars)}
print(ix_to_char)

{0: 'o', 1: 'n', 2: 'x', 3: '2', 4: '?', 5: '-', 6: '6', 7: 'h', 8: 'l', 9: '3', 10: 'u', 11: ',', 12: '.', 13: 'r', 14: '9', 15: 'i', 16: 'a', 17: 'm', 18: 'q', 19: '\n', 20: '(', 21: 'p', 22: 'g', 23: ']', 24: 'v', 25: '7', 26: '$', 27: 'z', 28: 'b', 29: 'j', 30: 'f', 31: 'd', 32: '0', 33: 'e', 34: '5', 35: 's', 36: 'y', 37: '4', 38: '8', 39: 'c', 40: 'w', 41: '[', 42: ')', 43: 'k', 44: 't', 45: ':', 46: ' ', 47: '1'}


In [34]:
vector_for_char_a = np.zeros((vocab_size, 1)) #r, c
vector_for_char_a[char_to_ix['a']] = 1
print(vector_for_char_a.ravel())

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [35]:
#Hyperparameters

hidden_size = 200
seq_length = 25 #generate 25 chars at a time
learning_rate = 1e-1

In [36]:
#Model parameters

#r, c: [0, 1]
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 #input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 #input to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01 #input to hidden
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))

* _Wxh_ are parameters to connect a vector that contain one input to the hidden layer.
* _Whh_ are parameters to connect the hidden layer to itself. This is the Key of the Rnn: Recursion is done by injecting the previous values from the output of the hidden state, to itself at the next iteration.
* _Why_ are parameters to connect the hidden layer to the output
* _bh_ contains the hidden bias
* _by_ contains the output bias

Loss input:  
* List of input chars
* List of target chars
* Previous hidden state

Function output:
* Loss
* Gradient for each parameter between layers
* The last hidden state

In [37]:
def lossFunc(inputs, targets, hprev):
    #store our inputs, hidden states, outputs, and probability values
    xs, hs, ys, ps = {}, {}, {}, {}
    
    hs[-1] = np.copy(hprev)
    loss = 0
    
    #forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1)) #81*1
        xs[t][inputs[t]] = 1 #1 hot
        
                            # 100*81.81*1 + ... => 100*1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) #el wise tanh
        
                    # 81x100.100x1 => 81x1
        ys[t] = np.dot(Why, hs[t]) + by #vector output, not num! 81*1
        
        #81x1
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) #vector/scalar division, softmax of predictions
        
        #cross entropy loss, calculates loss when the correct probability is 1
        loss += -np.log(ps[t][targets[t],0]) 
        
    #backward pass
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0]) #scalar
    
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 
        #the correct value now has "negative" gradient, so it should "keep going"
        #the other values are literally their error, because they should be 0
        
        #a giant matrix of the "errors", opinions from each character 
        #on how each weight should change (more positive is higher "error")
        #81x1.1x100 => 81x100
        dWhy += np.dot(dy, hs[t].T) 

        #just use output error as derivative of output bias
        dby += dy 
        
        #backpropagate!
                #81*100.100x1 =>81x1 + scalar
        dh = np.dot(Why.T, dy) + dhnext # backprop into h           
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity                                                                                                                                                                
        
        dbh += dhraw #derivative of hidden bias
        dWxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
        dWhh += np.dot(dhraw, hs[t-1].T) #derivative of hidden layer to hidden layer weight
        dhnext = np.dot(Whh.T, dhraw) 
        
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients                                                                                                                 
      
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
        

In [38]:
#prediction, one full forward pass
def sample(h, seed_ix, n):
    """                                                                                                                                                                                         
    sample a sequence of integers from the model                                                                                                                                                
    h is memory state, seed_ix is seed letter for first time step   
    n is how many characters to predict
    """
    #create vector
    x = np.zeros((vocab_size, 1))
    #customize it for our seed char
    x[seed_ix] = 1
    #list to store generated chars
    ixes = []
    #for as many characters as we want to generate
    for t in range(n):
        #a hidden state at a given time step is a function 
        #of the input at the same time step modified by a weight matrix 
        #added to the hidden state of the previous time step 
        #multiplied by its own hidden state to hidden state matrix.
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        #compute output (unnormalised)
        y = np.dot(Why, h) + by
        ## probabilities for next chars
        p = np.exp(y) / np.sum(np.exp(y))
        #pick one with the highest probability 
        ix = np.random.choice(range(vocab_size), p=p.ravel()) #p: probability dist
        #create a vector
        x = np.zeros((vocab_size, 1))
        #customize it for the predicted char
        x[ix] = 1
        #add it to the list
        ixes.append(ix)

    txt = ''.join(ix_to_char[ix] for ix in ixes)
    print('----\n {} \n----'.format(txt))

In [39]:
hprev = np.zeros((hidden_size,1)) # reset RNN memory  
#predict the 200 next characters given 'a'
sample(hprev,char_to_ix['a'],200)

----
 gr$qg9n-pl0wssp s]w
oks??3,,)sl?um,zn.?vad[8
9y, 53hdp?c
u,w$y5ffpplx$srwj41by1ih.bjkpxc2,re]n6fg  cv5:k5vq:qmi2l(p,$qe72h9ecs-ia7iahmygewm(t14c8uleqlf bksqdu9i(u,kf)xlhsr]u)p)7w6o9[2[5 4jpsx]-t9]h)6y 
----


In [40]:
p=0  
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
print("inputs", inputs)
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
print("targets", targets)

inputs [8, 33, 35, 35, 46, 44, 7, 16, 1, 46, 0, 1, 33, 46, 36, 33, 16, 13, 46, 7, 16, 35, 46, 21, 16]
targets [33, 35, 35, 46, 44, 7, 16, 1, 46, 0, 1, 33, 46, 36, 33, 16, 13, 46, 7, 16, 35, 46, 21, 16, 35]


In [None]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad                                                                                                                
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0   
while True: #n<=1000*100:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    # check "How to feed the loss function to see how this part works
    if p+seq_length+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size,1)) # reset RNN memory                                                                                                                                      
        p = 0 # go from start of data                                                                                                                                                             
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # forward seq_length characters through the net and fetch gradient                                                                                                                          
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFunc(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    # sample from the model now and then                                                                                                                                                        
    if n % 1000 == 0:
        print('iter {}, loss: {}'.format(n, smooth_loss)) # print progress
        sample(hprev, inputs[0], 200)

    # perform parameter update with Adagrad                                                                                                                                                     
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update                                                                                                                   

    p += seq_length # move data pointer                                                                                                                                                         
    n += 1 # iteration counter

iter 0, loss: 96.78001954118726
----
 ]a,5az7i4v36:m83zejqk]i).8l)(m872
j5i5v8q1:vjrd76c:xhocvg3wb .-lqovg3)surixw.(w,(rl42c5-chcqjq70afzsqk[aho170ayp.4oeh: :0fm)1s3b?3e7g20d
sair, vaa2[9,w8sb[pkpno-dy9(g:2 l$hqfa8yq$ -u(l($e84vs856vry,]f 
----
iter 1000, loss: 91.38001849732053
----
 ge0ig,eii eme :ec    fp alee.tnesst-ta]ngmtaemtoe g,gytts2inp t y7 otataytadwarod evr,c dasneoghtdensmu ohternvo ak
rsoni p grt cer,sdb wa
 k reiei tnirtasaciania a,lflt re  t ,7l xhtnu ln  ihemtnh i  
----
iter 2000, loss: 79.76089898248762
----
 e uon he, thigrcnrnanm
4e cnmt senf aomo o,s. kraping thet.
doias il
pedrlucathng an7te dy txu o, nceo,y o aisecmlow wicize 9oeg i7y$ asli yrukwho tinellnaehas ee,sbre alrag mern ahlors pten ,e ]hhay. 
----
iter 3000, loss: 70.20436391871611
----
 feceoven gory ogon, in sh ve shorec. ihese esd anl eos  obhry cave tongu  weod pomleud etas omingehcin ansried


)ve agu pad nanve lle thehere mwhnl hind anr oecngouiusd toery amepousr on iss peresa p 
----
iter 4000, 

iter 33000, loss: 51.15521021375037
----
 in streded aler andirto one tor whata of daded wheis wounting ind deicontonth, con.
t: thatncins hhes un.

pevucedha ponger ase coseris of thes, tand wille hak thegread the.

a feve threiting anrorili 
----
iter 34000, loss: 50.46723942602622
----
  pithta, nis yousyas the coull biculationizis and nget of it thirg pugre we tostry we we reand of.

the aleldly perersententey froudbly the bnesto fus frucanser sos tor to b elk yi 250 ie.
,0. anthati 
----
iter 35000, loss: 50.170050675656036
----
  ogringrear, a peaington, treaved nosen sow st-buucsss, that beneme corming ine that caker toun younty reates angrectame ous thes. alicgen keverdtoun fore reave dus bese scraftonder mo chare, anth pro 
----
iter 36000, loss: 50.16531574467243
----
 ld taly state plongitith sy trengush farery of tanut we bepte.

a fare saed sugrealls hm nome the to pefatsarn nable-orhe heap age thest necpringnst pot bes rucen sucling tivis we hoocracl to bemated, 
----
ite