In [1]:
import copy, numpy as np
import cudamat as cm

data = open('HP1.txt','r', encoding="utf8").read();
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(data_size,", ",vocab_size)

ModuleNotFoundError: No module named 'cudamat'

In [2]:
# Dictionary of input chars & indices
char_to_ix = {ch:i for i, ch in enumerate(chars)}
ix_to_char = {i:ch for i,ch in enumerate(chars)}
print(char_to_ix)
print(ix_to_char)

{'N': 0, 't': 1, 'O': 2, ';': 3, 'h': 4, '9': 5, 'X': 6, 'd': 7, 'T': 8, 'c': 9, 's': 10, '\t': 11, '!': 12, 'E': 13, "'": 14, 'V': 15, 'J': 16, ':': 17, 'r': 18, 'j': 19, '~': 20, 'M': 21, '2': 22, 'P': 23, '\n': 24, 'W': 25, 'z': 26, '\\': 27, '?': 28, 'Z': 29, '1': 30, 'a': 31, 'S': 32, 'L': 33, 'l': 34, 'b': 35, 'x': 36, 'U': 37, '7': 38, '*': 39, '5': 40, ',': 41, 'B': 42, 'v': 43, 'G': 44, 'e': 45, ' ': 46, 'u': 47, 'A': 48, 'i': 49, '4': 50, 'Q': 51, 'Y': 52, 'F': 53, 'g': 54, '.': 55, 'p': 56, 'w': 57, 'y': 58, ')': 59, '8': 60, '6': 61, 'f': 62, 'C': 63, 'R': 64, '(': 65, 'o': 66, 'D': 67, 'm': 68, '-': 69, 'H': 70, '"': 71, 'q': 72, 'I': 73, '3': 74, 'n': 75, 'k': 76, '0': 77, 'K': 78}
{0: 'N', 1: 't', 2: 'O', 3: ';', 4: 'h', 5: '9', 6: 'X', 7: 'd', 8: 'T', 9: 'c', 10: 's', 11: '\t', 12: '!', 13: 'E', 14: "'", 15: 'V', 16: 'J', 17: ':', 18: 'r', 19: 'j', 20: '~', 21: 'M', 22: '2', 23: 'P', 24: '\n', 25: 'W', 26: 'z', 27: '\\', 28: '?', 29: 'Z', 30: '1', 31: 'a', 32: 'S', 33: 

In [3]:
# demo of onehot encoding
vector_for_char_a = np.zeros((vocab_size,1))
vector_for_char_a[char_to_ix['a']] = 1
print(vector_for_char_a.ravel())

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]


In [4]:
# Hyperparams
hidden_size = 100
seq_length = 25
learning_rate = 1e-1
# the lower the learning rate, the quicker the network abandons old belief for new input
# e.g. train images on dogs, give a cat, low learning rate will consider cat is anormally rather than dog

# Model params
Wxh = np.random.randn(hidden_size,  vocab_size)* 0.01 # input to hidden (input is onehot encoded)
Whh = np.random.randn(hidden_size, hidden_size)* 0.01 # recurrent hidden .
Why = np.random.randn(vocab_size,  hidden_size)* 0.01 # hidden to output(decode the output)

Bxh = np.zeros((hidden_size,1))
Bhy = np.zeros((vocab_size,1))


In [5]:
# softmax helper
def softmax(seq):
    return np.exp(seq)/ np.sum(np.exp(seq))

def softmax_array(two_D_seq,t):
    return np.exp(two_D_seq[t])/ np.sum(np.exp(two_D_seq[t]))

In [7]:
# Loss function - training
def lossFunction(inputs, targets, prev_hidden):
    # p is softmax probability
    xs, hs, ys, ps = {},{},{},{};
    
    hs[-1] = copy.deepcopy(prev_hidden)
    loss = 0;
    
    # Fwd pass    
    for t in range(len(inputs)):
        # One hot encoding for the input char using our dictionary
        xs[t] = np.zeros((vocab_size,1));
        xs[t][inputs[t]] = 1; 
        
        hs[t] = np.tanh(np.dot(Wxh,xs[t]) + np.dot(Whh,hs[t-1]) + Bxh);
        ys[t] = np.dot(Why,hs[t]) + Bhy;
        ps[t] = softmax_array(ys,t);
        char_idx = targets[t]
        loss += -np.log(ps[t][char_idx,0]) 
        # ps[t][targets[t]] is the prob. node corrs. to. t_th char in the label array

        
    # Gradient value holders
    dWxh, dWhh, dWhy = np.zeros_like(Wxh),np.zeros_like(Whh),np.zeros_like(Why)
    dBxh, dBhy = np.zeros_like(Bxh),np.zeros_like(Bhy)
    dhnext = np.zeros_like(hs[0])
    
    # Bwd pass
    for t in reversed(range(len(inputs))):
        dy = copy.deepcopy(ps[t])
        dy[targets[t]] -= 1 # this is how we calculate loss using onehot encoding
         
        dWhy += np.dot(dy, hs[t].T);
        
        dBhy += dy # derivative w.r.t bias is 1
        
        dh = np.dot(Why.T,dy) + dhnext # back prop the error from y into h
        dhraw = (1-hs[t]*hs[t])*dh # back prop thru tanh
        
        dBxh += dhraw  # derivative of Wx+b w.r.t b is 1; d_loss/d_b = d_loss/d_H * d_H/d_b
        dWxh += np.dot(dhraw, xs[t].T) # derivative of Wx+b w.r.t W is x
        dWhh += np.dot(dhraw,hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)

    # Can be replaced using LSTM structure
    for dparam in [dWxh, dWhh, dWhy, dBxh, dBhy]:
        np.clip(dparam,-5,5,out=dparam) # mitigate gradient vanish
        
        
    return loss,dWxh,dWhh, dWhy, dBxh, dBhy, hs[len(inputs)-1];


 

In [8]:
# prediction
def sample(h,seed_ix,n):
    x = np.zeros((vocab_size,1))
    # one hot encode
    x[seed_ix] = 1;
    ixes = [] # empty sentence

    for t in range(n):

        h = np.tanh(np.dot(Wxh,x) + np.dot(Whh,h) + Bxh);
        y = np.dot(Why,h) + Bhy;
        p = softmax(y);

        # sample the output
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        
        # encode this output
        x = np.zeros((vocab_size,1))
        x[ix] = 1;

        ixes.append(ix)
        # if n > 1, it will predict more than 1 subsequent chars
        
    txt = ''.join(ix_to_char[ix] for ix in ixes)
    print ("----\n %s \n----" % (txt,))

In [9]:
# Training using Adagrad (decreasing learning rate)
position = 0
inputs = [char_to_ix[ch] for ch in data[position:position+seq_length]]
print(data[position:position+seq_length])
print("inputs",inputs)
targets = [char_to_ix[ch] for ch in data[position+1:position+seq_length+1]] 
print(data[position+1:position+seq_length+1])
print("targets",targets)



Harry Potter and the Sorc
inputs [70, 31, 18, 18, 58, 46, 23, 66, 1, 1, 45, 18, 46, 31, 75, 7, 46, 1, 4, 45, 46, 32, 66, 18, 9]
arry Potter and the Sorce
targets [31, 18, 18, 58, 46, 23, 66, 1, 1, 45, 18, 46, 31, 75, 7, 46, 1, 4, 45, 46, 32, 66, 18, 9, 45]


In [10]:
# Real training

n, position = 0,0

# for Adaptive Gradient descent
mWxh = np.zeros_like(Wxh);
mWhh = np.zeros_like(Whh);
mWhy = np.zeros_like(Why);
mBxh = np.zeros_like(Bxh);
mBhy = np.zeros_like(Bhy);

smooth_loss = -np.log(1.0/vocab_size)*seq_length;

epoch = 100*1000;
sample_length = 200;

while n<epoch:
    
    if(position+seq_length+1 >= len(data) or n == 0):
        
        hprev = np.zeros((hidden_size,1))
        position = 0;
        
    inputs = [char_to_ix[ch] for ch in data[position:position+seq_length]]
    targets = [char_to_ix[ch] for ch in data[position+1:position+seq_length+1]] 
    loss,dWxh,dWhh, dWhy, dBxh, dBhy, hprev = lossFunction(inputs,targets,hprev)  
    smooth_loss = smooth_loss*0.999+loss*0.001
    
    if(n%1000 == 0):
        print(n,"loss",smooth_loss)
        sample(hprev,inputs[0],sample_length);
        
    # update
    for param, dparam, mem in zip([Wxh,Whh,Why,Bxh,Bhy],
                                  [dWxh,dWhh,dWhy,dBxh,dBhy],
                                 [mWxh,mWhh,mWhy,mBxh,mBhy]):
        
        mem += dparam*dparam;
        param += -learning_rate * dparam *  1 / np.sqrt(mem +1e-8) # Adagrad
        
    position += seq_length;
    n += 1;
    
        

0 loss 109.236196764
----
 71mRrsDnVtcFsz-eI,-IR
	24Lb8iYbsvB"OrG7Os95QfI6H'asOjiaYCcWNxNjPjA1tvn0-(Yap2YL,YXj;t?MsbQ\0Idd)comEhTc
vOF4G:XzggUY-a(?JYz8rD6Q?H	LW7
1QT,7 BxwBzuaz7KiyaDrSl9Vq8KWEp7'U)I6wj)yq 
mbYnphRcn n9'~X"~4EA1 
----
1000 loss 85.6394419564
----
  taove thstdatsotaecat tonttsimbeld
e  anild Dnt nile thovenl wisse soy wpalg hind aref rby thacsrintnasdey plf sicG, rir neyereopeca-e yirilt onessocafe held sisektocivevett yes cinGonhorelyyer yreor 
----
2000 loss 70.7535694981
----
 he w. the biss
 har one word ppoa cowthame hemelt boolacphethethe dloed yorimls hethe pDud,y bipper lopry he Voale carthis, angwathim-ann. hher an wad he wha he ms wo the  his wasrey vaspewwivit hnon  
----
3000 loss 63.6919067969
----
 d uthict thy as ths outcin ass shis how home onstatuntto ffry con, sut ce Vot sasped bthino dop cu lo his ta interes red- tt bre wrof tinger'n athowstam tor thiswhe vifh qfo boas stat aanthanca tampin 
----
4000 loss 60.8374273035
----
 ay oy mis t of un,.wDnam 

KeyboardInterrupt: 