In [20]:
import numpy as np
data = open('avengers.txt', 'r').read()

chars = list(set(data)) 
data_size, vocab_size = len(data), len(chars)
print ('data has %d chars, %d unique' % (data_size, vocab_size))

char_to_ix = { ch:i for i,ch in enumerate(chars)}
ix_to_char = { i:ch for i, ch in enumerate(chars)}
print (char_to_ix)
print (ix_to_char)

#model parameters

hidden_size = 100
seq_length = 25
learning_rate = 1e-1

Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 #input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 #input to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01 #input to hidden
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))
            
            
            
       

data has 196865 chars, 88 unique
{'[': 0, '\xad': 1, 'V': 2, '8': 3, 't': 4, 'W': 5, ',': 6, '(': 7, 's': 8, 'g': 9, 'd': 10, '-': 11, 'b': 12, '%': 13, '7': 14, '6': 15, '|': 16, 'B': 17, '#': 18, 'A': 19, '!': 20, ']': 21, 'C': 22, 'c': 23, '.': 24, 'J': 25, 'x': 26, 'G': 27, ';': 28, 'Ö': 29, 'q': 30, 'P': 31, 'm': 32, "'": 33, 'Ã': 34, '0': 35, 'a': 36, '9': 37, 'n': 38, 'Q': 39, '4': 40, '2': 41, 'E': 42, 'y': 43, 'H': 44, '`': 45, 'U': 46, 'L': 47, 'Y': 48, ' ': 49, 'T': 50, 'p': 51, 'r': 52, '"': 53, 'R': 54, 'F': 55, '\t': 56, 'K': 57, '1': 58, 'O': 59, 'w': 60, 'f': 61, 'o': 62, 'j': 63, '&': 64, '5': 65, 'N': 66, 'i': 67, '3': 68, 'h': 69, 'I': 70, '/': 71, 'Z': 72, 'l': 73, 'k': 74, 'ö': 75, 'M': 76, 'X': 77, '\n': 78, 'z': 79, ')': 80, ':': 81, 'e': 82, 'D': 83, 'S': 84, 'u': 85, 'v': 86, '?': 87}
{0: '[', 1: '\xad', 2: 'V', 3: '8', 4: 't', 5: 'W', 6: ',', 7: '(', 8: 's', 9: 'g', 10: 'd', 11: '-', 12: 'b', 13: '%', 14: '7', 15: '6', 16: '|', 17: 'B', 18: '#', 19: 'A', 20: '

In [21]:
def lossFun(inputs, targets, hprev):
  """                                                                                                                                                                                         
  inputs,targets are both list of integers.                                                                                                                                                   
  hprev is Hx1 array of initial hidden state                                                                                                                                                  
  returns the loss, gradients on model parameters, and last hidden state                                                                                                                      
  """
  #store our inputs, hidden states, outputs, and probability values
  xs, hs, ys, ps, = {}, {}, {}, {} #Empty dicts
    # Each of these are going to be SEQ_LENGTH(Here 25) long dicts i.e. 1 vector per time(seq) step
    # xs will store 1 hot encoded input characters for each of 25 time steps (26, 25 times)
    # hs will store hidden state outputs for 25 time steps (100, 25 times)) plus a -1 indexed initial state
    # to calculate the hidden state at t = 0
    # ys will store targets i.e. expected outputs for 25 times (26, 25 times), unnormalized probabs
    # ps will take the ys and convert them to normalized probab for chars
    # We could have used lists BUT we need an entry with -1 to calc the 0th hidden layer
    # -1 as  a list index would wrap around to the final element
  xs, hs, ys, ps = {}, {}, {}, {}
  #init with previous hidden state
    # Using "=" would create a reference, this creates a whole separate copy
    # We don't want hs[-1] to automatically change if hprev is changed
  hs[-1] = np.copy(hprev)
  #init loss as 0
  loss = 0
  # forward pass                                                                                                                                                                              
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation (we place a 0 vector as the t-th input)                                                                                                                     
    xs[t][inputs[t]] = 1 # Inside that t-th input we use the integer in "inputs" list to  set the correct
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state                                                                                                            
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars                                                                                                           
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars                                                                                                              
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)                                                                                                                       
  # backward pass: compute gradients going backwards    
  #initalize vectors for gradient values for each set of weights 
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    #output probabilities
    dy = np.copy(ps[t])
    #derive our first gradient
    dy[targets[t]] -= 1 # backprop into y  
    #compute output gradient -  output times hidden states transpose
    #When we apply the transpose weight matrix,  
    #we can think intuitively of this as moving the error backward
    #through the network, giving us some sort of measure of the error 
    #at the output of the lth layer. 
    #output gradient
    dWhy += np.dot(dy, hs[t].T)
    #derivative of output bias
    dby += dy
    #backpropagate!
    dh = np.dot(Why.T, dy) + dhnext # backprop into h                                                                                                                                         
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity                                                                                                                     
    dbh += dhraw #derivative of hidden bias
    dWxh += np.dot(dhraw, xs[t].T) #derivative of input to hidden layer weight
    dWhh += np.dot(dhraw, hs[t-1].T) #derivative of hidden layer to hidden layer weight
    dhnext = np.dot(Whh.T, dhraw) 
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients                                                                                                                 
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [18]:
import numpy as np
#prediction, one full forward pass
def sample(h, seed_ix, n):
  """                                                                                                                                                                                         
  sample a sequence of integers from the model                                                                                                                                                
  h is memory state, seed_ix is seed letter for first time step   
  n is how many characters to predict
  """
  #create vector
  x = np.zeros((vocab_size, 1))
  #customize it for our seed char
  x[seed_ix] = 1
  #list to store generated chars
  ixes = []
  #for as many characters as we want to generate
  for t in range(n):
    #a hidden state at a given time step is a function 
    #of the input at the same time step modified by a weight matrix 
    #added to the hidden state of the previous time step 
    #multiplied by its own hidden state to hidden state matrix.
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    #compute output (unnormalised)
    y = np.dot(Why, h) + by
    ## probabilities for next chars
    p = np.exp(y) / np.sum(np.exp(y))
    #pick one with the highest probability 
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    #create a vector
    x = np.zeros((vocab_size, 1))
    #customize it for the predicted char
    x[ix] = 1
    #add it to the list
    ixes.append(ix)

  txt = ''.join(ix_to_char[ix] for ix in ixes)
  print ('----\n %s \n----' % (txt, ))
hprev = np.zeros((hidden_size,1)) # reset RNN memory  
#predict the 200 next characters given 'a'
sample(hprev,char_to_ix['a'],200)

----
 
vOi 5U%Cs]aKMm3%zpO'HbDEw`W#CqLB(z1R' z,0öWj5,'kNbBöuSfa1-J9.o[6txgTs|VnG]d#MDZ­HIu!Lnnw8Q9wbeÖfp]Ev9h	3Ã/vzzbvV(N%|I[ö:bv,crpYVQ	kFjjL9dMRcAzufNZ(O`6Ã	;|q	LiÃ)I3)&&qs%vch4q0(BR|b?zu6
ÃyX6w)Z&JI90v|3 
----


In [22]:
p=0  
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
print ("inputs", inputs)
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
print ("targets", targets)

inputs [78, 56, 78, 78, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49]
targets [56, 78, 78, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49]


In [23]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad                                                                                                                
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0                                                                                                                        
while n<=1000*100:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  # check "How to feed the loss function to see how this part works
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory                                                                                                                                      
    p = 0 # go from start of data                                                                                                                                                             
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # forward seq_length characters through the net and fetch gradient                                                                                                                          
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001

  # sample from the model now and then                                                                                                                                                        
  if n % 1000 == 0:
    print ('iter %d, loss: %f' % (n, smooth_loss)) # print progress
    sample(hprev, inputs[0], 200)

  # perform parameter update with Adagrad                                                                                                                                                     
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update                                                                                                                   

  p += seq_length # move data pointer                                                                                                                                                         
  n += 1 # iteration counter

iter 0, loss: 111.933410
----
 )EC#Rr2Nu:%ABi6B%ÖmzlDCGlYUsfa3%ju/ZCr&Pa)x?aX2]Cb
(SDc#z(
O]s8sh|5o-SSOÖ]'H5f5tS[V)sÖn;/6Ö#`f­!4`)qq" fJ6-IF:­bQx4AT]W0;	RD&IA%Jh(]K/
jaU
/gCheCWu0Y"DZJ4q"dbUar8k|EqiÖh:0w;E)dC%u1(z4
Tqqöj9Da#5!tocp" 
----
iter 1000, loss: 89.954568
----
     e  lL b t. W
  k O   iE'el      
oa u  s  tTd  dtehb  eh  sUGA.ua     
Ie 
  Ba  R iTosom  rS e o I
elnl tt! n heA rfp  hh  MeofsNn   
W   e t y o  enl n tn . oten
 o kot  Them.NR    ashni    gpt  
----
iter 2000, loss: 74.702735
----
  a
iym uorn  on      vlt    beh      ol  o,kOnb  K nl. tsuREsm.hI  ti dOnvc  argfoe  u e,aeu.et   oea  reemh cld doiTdeE le  .
nra I?fes eog Ihr ie I . aee
t hor A,H  THokl,co  nawhta
 r as    k   ryH 
----
iter 3000, loss: 69.383321
----
 o  .!   A    st    l!    T      c o   g  Ao   e     He sh    t!       dSfrt       a'
  ta   H

  Hot n   AD YN hn   P     H    mn !KSS  . P.
 
 
 AY  ! a  ReViRAb
 t Uu   R!A  d    ad      on    v  m  
----
iter 4000, loss: 64.491214
----
    aiFe  

iter 35000, loss: 51.299178
----
  Fse t  is
                         n RORO bunitz'ys AIISbbradtsDSNTT hoyuon b. TONTT.      thse oud O                   FOIBeo
      
  Nok
                   intielzlsyin tONtaiy hutt ta MAY `hap uk 
----
iter 36000, loss: 49.728611
----
 Tps k R OOISCEoERon birg. ba  UE              onrib,
 veak stIROMILA                                     ws hetes ONY ENoue tAeak  fun
    BVoaltori fentiho fse hd HE..
      IAu hrgalnnies AG)
       
----
iter 37000, loss: 54.235828
----
  gk kier, IASI.
       TOPARssepy. mOLI NIINLE      NSut t fsealm er
               RVICHY  fo mce w  soen  'gy. SNOIIRoombr ee owarnhe ok dtgth mrejoutnpe er.
           (thissts astyt. Aulan aon nde 
----
iter 38000, loss: 52.804560
----
  goUATKAGO                         AN D
            e ems t­   DDEIT FA Hmlet, a IThong ENY SEY  otong.
         aAY PRMai gowete u ha ttoWEL
   s GER A      Goypin
      Tha bopade ghke n ie n .
     
----
iter 39000, loss: 54.754320
----
 PG

iter 69000, loss: 47.202359
----
  O Thuse Qoiy's loll CHO
                       ar c see. Y A SHOXE.R an tady  IUl
         Whloatsebl
                       MID Thir eliddy. Y YHE QIRETond.

    'anon. ton ipte. wh ek.
 rut. T AATE 
----
iter 70000, loss: 49.272767
----
 REND Ho Wut tinlku thans amn.
  retk i bRI7. HIS we.

              t tod  JE ERADw s rolg inec3ig. in  MIRIL ARD NPOKs outang ange the fat resi t atn ADAT tSEMPACAKA  rebe anlip.   FHTOH. She HKT H H 
----
iter 71000, loss: 50.207115
----
            F ILI AANS. Y cu thhes av ye hoit ar at son
         Thitd on.   Thiny,
         n wad- dlune. the the the liyan wank.
        ve o tsan mlou ats we. Hus an. a Fet..
      SACND.
    the me 
----
iter 72000, loss: 48.674539
----
 ad?
     hok Ban. SRELY

          Ttme tange ot ag ontintbe su Bar inLO
             (om wad. s aprot.
 tois pook k lo a wanty  he nesir. a bow.
    Ton foche  OY

                 BFIINLGE MTARTERA
 
----
iter 73000, loss: 46.615588
----
  i