In [59]:
import copy, numpy as np

data = open('HP1.txt','r', encoding="utf8").read();
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(data_size,", ",vocab_size)

431677 ,  79


In [60]:
# Dictionary of input chars & indices
char_to_ix = {ch:i for i, ch in enumerate(chars)}
ix_to_char = {i:ch for i,ch in enumerate(chars)}
print(char_to_ix)
print(ix_to_char)

{'d': 0, 'f': 1, 'g': 2, '\n': 3, 'B': 4, '.': 5, 'a': 6, 'E': 7, 'L': 8, 'M': 9, 'v': 10, 'G': 11, '!': 12, ' ': 13, 'o': 14, 'u': 15, ':': 16, '6': 17, 'N': 18, 'i': 19, 'e': 20, 'O': 21, 'F': 22, 'q': 23, '2': 24, 'P': 25, 'm': 26, 'Z': 27, ';': 28, 'w': 29, 'X': 30, 'k': 31, '5': 32, '\t': 33, 'l': 34, 's': 35, 'U': 36, 'x': 37, 't': 38, 'D': 39, '0': 40, '8': 41, 'T': 42, 'c': 43, '-': 44, '(': 45, 'J': 46, "'": 47, 'R': 48, 'I': 49, '9': 50, 'z': 51, '?': 52, ',': 53, 'S': 54, 'y': 55, 'H': 56, 'p': 57, '3': 58, '*': 59, 'V': 60, ')': 61, '\\': 62, '4': 63, '7': 64, 'j': 65, 'A': 66, 'Y': 67, 'Q': 68, '"': 69, 'r': 70, 'b': 71, 'h': 72, 'C': 73, 'W': 74, 'K': 75, 'n': 76, '1': 77, '~': 78}
{0: 'd', 1: 'f', 2: 'g', 3: '\n', 4: 'B', 5: '.', 6: 'a', 7: 'E', 8: 'L', 9: 'M', 10: 'v', 11: 'G', 12: '!', 13: ' ', 14: 'o', 15: 'u', 16: ':', 17: '6', 18: 'N', 19: 'i', 20: 'e', 21: 'O', 22: 'F', 23: 'q', 24: '2', 25: 'P', 26: 'm', 27: 'Z', 28: ';', 29: 'w', 30: 'X', 31: 'k', 32: '5', 33: '\

In [61]:
# demo of onehot encoding
vector_for_char_a = np.zeros((vocab_size,1))
vector_for_char_a[char_to_ix['a']] = 1
print(vector_for_char_a.ravel())

[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]


In [62]:
# Hyperparams
hidden_size = 100
seq_length = 25
learning_rate = 1e-1
# the lower the learning rate, the quicker the network abandons old belief for new input
# e.g. train images on dogs, give a cat, low learning rate will consider cat is anormally rather than dog

# Model params
Wxh = np.random.randn(hidden_size, vocab_size*2)* 0.01 # input to hidden (input is onehot encoded)
Whh = np.random.randn(hidden_size, hidden_size)* 0.01 # recurrent hidden .
Why = np.random.randn(vocab_size,  hidden_size)* 0.01 # hidden to output(decode the output)

Bxh = np.zeros((hidden_size,1))
Bhy = np.zeros((vocab_size,1))


In [63]:
# softmax helper
def softmax(seq):
    return np.exp(seq)/ np.sum(np.exp(seq))

def softmax_array(two_D_seq,t):
    return np.exp(two_D_seq[t])/ np.sum(np.exp(two_D_seq[t]))

In [64]:
# Loss function - training
def lossFunction(inputs, targets, prev_hidden):
    # p is softmax probability
    xs, hs, ys, ps = {},{},{},{};
    
    hs[-1] = copy.deepcopy(prev_hidden)
    loss = 0;
    
    # Fwd pass    
    for t in range(len(inputs)-1):
        # One hot encoding for the input char using our dictionary
        xs[t] = np.zeros((vocab_size*2,1));
        xs[t][inputs[t]] = 1;
        xs[t][inputs[t+1]+vocab_size] = 1;
        
        hs[t] = np.tanh(np.dot(Wxh,xs[t]) + np.dot(Whh,hs[t-1]) + Bxh);
        ys[t] = np.dot(Why,hs[t]) + Bhy;
        ps[t] = softmax_array(ys,t);
        char_idx = targets[t]
        loss += -np.log(ps[t][char_idx,0]) 
        # ps[t][targets[t]] is the prob. node corrs. to. t_th char in the label array

        
    # Gradient value holders
    dWxh, dWhh, dWhy = np.zeros_like(Wxh),np.zeros_like(Whh),np.zeros_like(Why)
    dBxh, dBhy = np.zeros_like(Bxh),np.zeros_like(Bhy)
    dhnext = np.zeros_like(hs[0])
    
    # Bwd pass
    for t in reversed(range(len(inputs)-1)):
        dy = copy.deepcopy(ps[t])
        dy[targets[t]] -= 1 # this is how we calculate loss using onehot encoding
         
        dWhy += np.dot(dy, hs[t].T);
        
        dBhy += dy # derivative w.r.t bias is 1
        
        dh = np.dot(Why.T,dy) + dhnext # back prop the error from y into h
        dhraw = (1-hs[t]*hs[t])*dh # back prop thru tanh
        
        dBxh += dhraw  # derivative of Wx+b w.r.t b is 1; d_loss/d_b = d_loss/d_H * d_H/d_b
        dWxh += np.dot(dhraw, xs[t].T) # derivative of Wx+b w.r.t W is x
        dWhh += np.dot(dhraw,hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)

    # Can be replaced using LSTM structure
    for dparam in [dWxh, dWhh, dWhy, dBxh, dBhy]:
        np.clip(dparam,-5,5,out=dparam) # mitigate gradient vanish
        
        
    return loss,dWxh,dWhh, dWhy, dBxh, dBhy, hs[len(inputs)-2];


 

In [65]:
# prediction
def sample(h,seed_ix1,seed_ix2,n):
    x = np.zeros((vocab_size*2,1))
    # one hot encode
    x[seed_ix1] = 1;
    x[seed_ix2+vocab_size] = 1;
    ixes = [] # empty sentence

    for t in range(n):

        h = np.tanh(np.dot(Wxh,x) + np.dot(Whh,h) + Bxh);
        y = np.dot(Why,h) + Bhy;
        p = softmax(y);

        # sample the output
        seed_ix1 = seed_ix2;
        seed_ix2 = np.random.choice(range(vocab_size), p=p.ravel())
        
        # encode this output
        x = np.zeros((vocab_size*2,1))
        x[seed_ix1] = 1;
        x[seed_ix2+vocab_size] = 1;

        ixes.append(seed_ix2)
        # if n > 1, it will predict more than 1 subsequent chars
        
    txt = ''.join(ix_to_char[seed_ix2] for seed_ix2 in ixes)
    print ("----\n %s \n----" % (txt,))
    
    

In [66]:
# Training using Adagrad (decreasing learning rate)
position = 0
inputs = [char_to_ix[ch] for ch in data[position:position+seq_length]]
print(data[position:position+seq_length])
print("inputs",inputs)
targets = [char_to_ix[ch] for ch in data[position+2:position+seq_length+2]] 
print(data[position+2:position+seq_length+2])
print("targets",targets)



Harry Potter and the Sorc
inputs [56, 6, 70, 70, 55, 13, 25, 14, 38, 38, 20, 70, 13, 6, 76, 0, 13, 38, 72, 20, 13, 54, 14, 70, 43]
rry Potter and the Sorcer
targets [70, 70, 55, 13, 25, 14, 38, 38, 20, 70, 13, 6, 76, 0, 13, 38, 72, 20, 13, 54, 14, 70, 43, 20, 70]


In [None]:
# Real training

n, position = 0,0

# for Adaptive Gradient descent
mWxh = np.zeros_like(Wxh);
mWhh = np.zeros_like(Whh);
mWhy = np.zeros_like(Why);
mBxh = np.zeros_like(Bxh);
mBhy = np.zeros_like(Bhy);

smooth_loss = -np.log(1.0/vocab_size)*seq_length;

epoch = 100*1000;
sample_length = 200;

while n<epoch:
    
    if(position+seq_length+1 >= len(data) or n == 0):
        
        hprev = np.zeros((hidden_size,1))
        position = 0;
        
    inputs = [char_to_ix[ch] for ch in data[position:position+seq_length]]
    #print(inputs)
    targets = [char_to_ix[ch] for ch in data[position+2:position+seq_length+2]] 
    loss,dWxh,dWhh, dWhy, dBxh, dBhy, hprev = lossFunction(inputs,targets,hprev)  
    smooth_loss = smooth_loss*0.999+loss*0.001
    
    if(n%1000 == 0):
        print(n,"loss",smooth_loss)
        sample(hprev,inputs[0],inputs[1],sample_length);
        
    # update
    for param, dparam, mem in zip([Wxh,Whh,Why,Bxh,Bhy],
                                  [dWxh,dWhh,dWhy,dBxh,dBhy],
                                 [mWxh,mWhh,mWhy,mBxh,mBhy]):
        
        mem += dparam*dparam;
        param += -learning_rate * dparam *  1 / np.sqrt(mem +1e-8) # Adagrad
        
    position += seq_length;
    n += 1;
    
        

0 loss 109.231824929
----
 SoQ4	hN4,D2O45GP ZP7a,zE6dP3WEO))m2;;uiV~zwj3O:aLtioWcMo8,Nv*j.Ep,m:gU\QEr-DKdAB9XOqwS. g?GV?f'6QzZfRn7JDDaDcJjZCzfG;0,x		UohSBX.R?QP	7Kn1L,pnv9ClH":;e\yG64St*)l?4vc4sdzRHdIdD.2l,a5obvz~Y5jjvwy5 lUxrA 
----
1000 loss 86.0486885856
----
 n sbatore od suveneostiugce?n fird sftheen en.eDnmcrt hhly Earwt oheaaglsatn ooshed scoit elverH mgolciOt.
jceorleslesoid.";l'- trrets ac nir 	uthe tche tiyg air xhinge and", taanitva.sYonIew't ouvs d 
----
2000 loss 70.6303134308
----
 ly bubdey sapreenge of hor,. Hand tibly he shendR- Hauroil. AI
moley aavrt, caw Mnk end shess-n Vet shagDs vrg. Hin, Pneih kolt oqd the ned oup tanoll sbis he whar..
Dstieseeidr mis Hins heos far's Au 
----
3000 loss 63.7814219395
----
 An yo ckedpered 'e thed Any Wemid soulw sakr thoucmid t init treapxing and bioc then.
Go nfoCmar'th ad at ak, breraste sipmoud bot, shi gave sryacl anlly UAg hobhe laldon tn-rgon benp ed sool tho id p 
----
4000 loss 61.0154274049
----
 eop tamer evly unt, ferd,

35000 loss 49.9602116771
----
 is nemthas agf arry brot pooum er a carir, his the nach ser tur twning, I: Mp a sudgoor thrave -- he becket hae mille as couldapingh he the looked hed rage arry sthe nore storid slack yowar Harry was  
----
36000 loss 49.9534621879
----
  trifast he Ceisledn's satc--"
"It withey. Pritser Prufferack het alys eget's rudn't rutpwas was Patr his Dimer up, iftiw not!" sfort ?"
Harryqoorgedd''r Quedredild kentiould if Mha troud ot loukn are 
----
37000 loss 49.7696268015
----
 yold slobpteat agle mo Maver forre troustry.
UPstranchanto "Bunsto tho slag. Harry-f beettomes anchey Vrou thmoning Mo the weCn aivee any any fizars. Dudne do beyou thet the was, wepey!"
Harry loo was 
----
38000 loss 50.4461217852
----
 both hed tal at yruckle. "Yaked. Dud Mod tien, ain. I knome enstaley, agoh, Hoor he Vor. EhG of ut fitthe than'l you watr yered' the for wot credm les, than'I the one be oke neard stheom ned goove tac 
----
39000 loss 50.5215024785
----
  avingh the jumdl

70000 loss 47.9265752622
----
 inged ceh it at himKeam walrey,." noachers ston.
"Wh'se laite pume oupttwastiled, inven biethey:
IR Serly sore he oinhors leker asten treis had sabpight lewn, sooked?"
"No bees urteakly lonke?"
"The h 
----
71000 loss 48.1626573477
----
 Hething nitand. They'g, speoring the him seachast gown ver Ungstarter whang thionel haid!"
The fuat enthe ticknut whon'ts yaid Prone, hethach farded ick walbe in ittingiknhirs, McG-le hads at reagen t 
----
