In [8]:
#import gnumpy as gpu

In [323]:
import numpy as np

# Helper functions
def softmax(array):
    return np.exp(array)/ np.sum(np.exp(array)) # return an array

def sigmoid(x):
    return (1/(1+np.exp(-x)))

def sigmoid_deriv(y):
    return (y*(1-y))

def tanh(x):
    return np.tanh(x)

def tanh_deriv(y):
    return 1 - pow(np.tanh(y),2)

# RNN
class myRNN:
    
    def __init__ (self, lenIn, lenOut, lenRec, sizeHidden, numHiddenLayer,inputs, targets, learningRate, biDir=0):
        
        # parameter save
        self.lenIn          = lenIn;
        self.lenOut         = lenOut;
        self.lenRec         = lenRec;
        self.sizeHidden     = sizeHidden;
        self.numHiddenLayer = numHiddenLayer;
        self.biDir          = biDir;
        self.learningRate   = learningRate;
        
        # for sampling phase
        self.x  = np.zeros(lenIn);  
        self.y  = np.zeros(lenOut); 
        self.W  = np.zeros((lenOut,sizeHidden)); # for the last fully connected layer
        self.GW = np.zeros((lenOut,sizeHidden)); # Gradient, for W-update using RMSprop
        self.Ob = np.zeros(lenOut);
        self.ObW= np.zeros(lenOut);
        
        ### There is NO need to update h and c for bwd prop. ###
        # for training phase - plus 1 because need initialize h,c (as zeros)
        self.xs = np.zeros((lenRec+1,lenIn));
        self.ys = np.zeros((lenRec+1,lenOut));
        self.cs = np.zeros((numHiddenLayer,lenRec+1,sizeHidden));
        self.hs = np.zeros((numHiddenLayer,lenRec+1,sizeHidden));
        
        # for training phase gate output storage - plus 1 for convenience of indexing
        self.fg = np.zeros((numHiddenLayer,lenRec+1,sizeHidden)) # forget gate
        self.ig = np.zeros((numHiddenLayer,lenRec+1,sizeHidden)) # input  gate
        self.og = np.zeros((numHiddenLayer,lenRec+1,sizeHidden)) # output gate
        self.mc = np.zeros((numHiddenLayer,lenRec+1,sizeHidden)) # memory cell state (candidate)
        
        # actual input 
        self.inputs = inputs;
        
        # for comparison - add one more entry at the begining since there is no output[0]
        self.targets = np.vstack((np.zeros(targets.shape[1]),targets))
        
        # LSTM class
        self.LSTM = [LSTM for i in range(numHiddenLayer)];
        
        if(numHiddenLayer == 1):
            self.LSTM[0] = LSTM(lenIn,sizeHidden,lenRec,learningRate);
            
        elif(numHiddenLayer == 2):
            lenIn1 = lenIn;
            lenIn2 = sizeHidden;
            if(biDir == 1):
                lenIn2  = lenIn+sizeHidden;
            self.LSTM[0] = LSTM(lenIn1,sizeHidden,lenRec,learningRate);
            self.LSTM[1] = LSTM(lenIn2,sizeHidden,lenRec,learningRate);
        
        else:
            lenIn1 = lenIn;
            lenIn2 = sizeHidden;
            if(biDir == 1):
                lenIn2  = lenIn;
            self.LSTM[0] = LSTM(lenIn1,sizeHidden,lenRec,learningRate);
            self.LSTM[1] = LSTM(lenIn2,sizeHidden,lenRec,learningRate);
            for i in range(2,numHiddenLayer):
                self.LSTM[i] = LSTM(lenIn2,sizeHidden,lenRec,learningRate);
                
    def update_inputs_targets(self, inputs, targets):
        self.inputs = inputs;
        self.targets = np.vstack((np.zeros(targets.shape[1]),targets))
    
    def fwd_pass(self):
        errSum = 0;
        if(self.biDir == 0):          
            for j in range(1,self.lenRec+1):
                # update input
                self.x    = self.inputs[j-1]
                self.xs[j]= self.inputs[j-1]
                
                # first layer
                self.LSTM[0].x = self.x;
                self.LSTM[0].ph= self.hs[0][j-1];
                c, h, f, i, m, o = self.LSTM[0].fwd_pass();
                self.fg[0][j] = f;
                self.ig[0][j] = i;
                self.og[0][j] = o;
                self.mc[0][j] = m;
                self.cs[0][j] = c;
                self.hs[0][j] = h;
                
                # other layers
                if(self.numHiddenLayer > 1):
                    for k in range(1,numHiddenLayer):
                        self.LSTM[k].x = self.hs[k-1][j];
                        self.LSTM[k].ph= self.hs[k][j-1];
                        c, h, f, i, m, o = self.LSTM[k].fwd_pass();
                        self.fg[k][j] = f;
                        self.ig[k][j] = i;
                        self.og[k][j] = o;
                        self.mc[k][j] = m; # memory cell, aka cell state
                        self.cs[k][j] = c;
                        self.hs[k][j] = h;  
                        
                # output layer - may replace with softmax instead
                self.ys[j] = sigmoid(np.dot(self.W,self.hs[numHiddenLayer-1][j]));
                tmp = softmax(self.ys[j]);
                char_idx = np.argmax(self.targets[j]);
                errSum += -np.log(tmp[char_idx]);  
                
            return errSum;               
        else:
            # bidirectional learning
            
            
            return
    
    def bwd_pass(self):        
        
        #errSum = 0;
        
        c_grads  = np.zeros((self.numHiddenLayer,self.sizeHidden));
        h_grads  = np.zeros((self.numHiddenLayer,self.sizeHidden));
        
        W_grad   = np.zeros((self.lenOut,self.sizeHidden));
        Ob_grad  = np.zeros(self.lenOut);    
        
        xf_grad   = np.zeros((self.sizeHidden,self.LSTM[0].lenIn));
        xi_grad   = np.zeros((self.sizeHidden,self.LSTM[0].lenIn));
        xm_grad   = np.zeros((self.sizeHidden,self.LSTM[0].lenIn));
        xo_grad   = np.zeros((self.sizeHidden,self.LSTM[0].lenIn));
        
        hf_grad   = np.zeros((self.sizeHidden,self.sizeHidden));
        hi_grad   = np.zeros((self.sizeHidden,self.sizeHidden));
        hm_grad   = np.zeros((self.sizeHidden,self.sizeHidden));
        ho_grad   = np.zeros((self.sizeHidden,self.sizeHidden));
        
        fb_grad   = np.zeros(self.sizeHidden)
        ib_grad   = np.zeros(self.sizeHidden)
        mb_grad   = np.zeros(self.sizeHidden)
        ob_grad   = np.zeros(self.sizeHidden)
        
        
        if(self.biDir == 0): 
            
            if(numHiddenLayer > 1):
                xf_grads = np.zeros((numHiddenLayer-1,self.sizeHidden,self.sizeHidden));
                xi_grads = np.zeros((numHiddenLayer-1,self.sizeHidden,self.sizeHidden));
                xm_grads = np.zeros((numHiddenLayer-1,self.sizeHidden,self.sizeHidden));
                xo_grads = np.zeros((numHiddenLayer-1,self.sizeHidden,self.sizeHidden));
                
                hf_grads = np.zeros((numHiddenLayer-1,self.sizeHidden,self.sizeHidden));
                hi_grads = np.zeros((numHiddenLayer-1,self.sizeHidden,self.sizeHidden));
                hm_grads = np.zeros((numHiddenLayer-1,self.sizeHidden,self.sizeHidden));
                ho_grads = np.zeros((numHiddenLayer-1,self.sizeHidden,self.sizeHidden));    
                
                fb_grads = np.zeros((numHiddenLayer-1,self.sizeHidden));
                ib_grads = np.zeros((numHiddenLayer-1,self.sizeHidden));
                mb_grads = np.zeros((numHiddenLayer-1,self.sizeHidden));
                ob_grads = np.zeros((numHiddenLayer-1,self.sizeHidden));                
                    
            # propagates through time and layers
            for j in range(self.lenRec,-1,-1):
                # output to last hidden
                err = self.targets[j] - self.ys[j];
                
                
                W_grad += np.dot((np.atleast_2d(err*sigmoid_deriv(self.ys[j])).T),np.atleast_2d(self.hs[numHiddenLayer-1][j]));
                #print(W_grad)
                err = np.dot(self.W.T, err);
                
                for k in reversed(range(1,numHiddenLayer)):
                    # setup LSTM propagation parameters
                    self.LSTM[k].x = self.hs[k-1][j];
                    self.LSTM[k].ph= self.hs[k][j-1];
                    self.LSTM[k].c = self.cs[k][j];
                    
                    xf,xi,xm,xo,\
                    hf,hi,hm,ho,c_grads[k],h_grads[k],\
                    df,di,dm,do = \
                    self.LSTM[k].bwd_pass(err, self.cs[k][j-1],self.fg[k][j],self.ig[k][j],self.mc[k][j],self.og[k][j],\
                                         c_grads[k], h_grads[k]);
                                      
                    xf_grads[k-1] +=  xf;
                    xi_grads[k-1] +=  xi;  
                    xm_grads[k-1] +=  xm;  
                    xo_grads[k-1] +=  xo;  
                    hf_grads[k-1] +=  hf;  
                    hi_grads[k-1] +=  hi;  
                    hm_grads[k-1] +=  hm;  
                    ho_grads[k-1] +=  ho;  
                    fb_grads[k-1] +=  df;
                    ib_grads[k-1] +=  di;
                    mb_grads[k-1] +=  dm;
                    ob_grads[k-1] +=  do;

                    err = np.dot(self.LSTM[k].xfW, df) + np.dot(self.LSTM[k].xiW, di) +\
                          np.dot(self.LSTM[k].xoW, do) + np.dot(self.LSTM[k].xmW, dm);
                
                self.LSTM[0].x = self.xs[j];
                self.LSTM[0].ph= self.hs[0][j-1];
                self.LSTM[0].c = self.cs[0][j];

                xf,xi,xm,xo,\
                hf,hi,hm,ho,c_grads[0],h_grads[0],\
                df,di,dm,do = \
                self.LSTM[0].bwd_pass(err, self.cs[0][j-1],self.fg[0][j],self.ig[0][j],self.mc[0][j],self.og[0][j],\
                                     c_grads[0], h_grads[0]);

                xf_grad +=  xf;
                xi_grad +=  xi;  
                xm_grad +=  xm;  
                xo_grad +=  xo;  
                hf_grad +=  hf;  
                hi_grad +=  hi;  
                hm_grad +=  hm;  
                ho_grad +=  ho;  
                fb_grad +=  df;
                ib_grad +=  di;
                mb_grad +=  dm;
                ob_grad +=  do;
                
            
            # update using RMSprop
            for k in range(1,numHiddenLayer):
                self.LSTM[k].update(xf_grads[k-1]/self.lenRec, xi_grads[k-1]/self.lenRec, \
                                   xm_grads[k-1]/self.lenRec, xo_grads[k-1]/self.lenRec, \
                                   hf_grads[k-1]/self.lenRec, hi_grads[k-1]/self.lenRec, \
                                   hm_grads[k-1]/self.lenRec, ho_grads[k-1]/self.lenRec, \
                                   fb_grads[k-1]/self.lenRec, ib_grads[k-1]/self.lenRec, \
                                   mb_grads[k-1]/self.lenRec, ob_grads[k-1]/self.lenRec,
                                   );
            self.LSTM[0].update(xf_grad/self.lenRec, xi_grad/self.lenRec, \
                               xm_grad/self.lenRec, xo_grad/self.lenRec, \
                               hf_grad/self.lenRec, hi_grad/self.lenRec, \
                               hm_grad/self.lenRec, ho_grad/self.lenRec,\
                               fb_grad/self.lenRec, ib_grad/self.lenRec, \
                               mb_grad/self.lenRec, ob_grad/self.lenRec);
            
            
            self.update(W_grad/self.lenRec);
            
          
            
    def update(self, W_grad):
        self.GW = 0.9*self.GW + 0.1*W_grad**2;
        self.W -= self.learningRate/np.sqrt(self.GW + 1e-8) * W_grad;

    def sample(self,inputs):
        if(self.biDir == 0):          

            # update input
            self.x    = inputs;
            # first layer
            self.LSTM[0].x = self.x;
            #print(self.x.shape)
            self.LSTM[0].ph= self.hs[0][0];
            c, h, f, i, m, o = self.LSTM[0].fwd_pass();
            self.fg[0][1] = f;
            self.ig[0][1] = i;
            self.og[0][1] = o;
            self.mc[0][1] = m;
            self.cs[0][1] = c;
            self.hs[0][1] = h;

            # other layers
            if(self.numHiddenLayer > 1):
                for k in range(1,numHiddenLayer):
                    self.LSTM[k].x = self.hs[k-1][1];
                    self.LSTM[k].ph= self.hs[k][0];
                    c, h, f, i, m, o = self.LSTM[k].fwd_pass();
                    self.fg[k][1] = f;
                    self.ig[k][1] = i;
                    self.og[k][1] = o;
                    self.mc[k][1] = m; # memory cell, aka cell state
                    self.cs[k][1] = c;
                    self.hs[k][1] = h;  

            # output layer - may replace with softmax instead
            self.ys[1] = sigmoid(np.dot(self.W,self.hs[numHiddenLayer-1][1]));
            #print(self.W)
            maxIdx = np.argmax(self.ys[1])
            return maxIdx;

        else:
            # bidirectional learning         
            return    


In [324]:
class LSTM:
    
    def __init__ (self,lenIn,sizeHidden,lenRec,learningRate):
        self.lenIn        = lenIn
        self.sizeHidden   = sizeHidden
        self.lenRec       = lenRec
        self.learningRate = learningRate
        
        # x is x and h horizontally stacked together [x]
        self.x = np.zeros(lenIn)
        self.ph= np.zeros(sizeHidden)
        self.h = np.zeros(sizeHidden)
        self.c = np.zeros(sizeHidden)
        
        # Weight matrices
        self.xfW = np.random.random((sizeHidden,lenIn));
        self.xiW = np.random.random((sizeHidden,lenIn));
        self.xoW = np.random.random((sizeHidden,lenIn));
        self.xmW = np.random.random((sizeHidden,lenIn)); # cell state matrix(it is also a gate)
        
        self.hfW = np.random.random((sizeHidden,sizeHidden));
        self.hiW = np.random.random((sizeHidden,sizeHidden));
        self.hoW = np.random.random((sizeHidden,sizeHidden));
        self.hmW = np.random.random((sizeHidden,sizeHidden));       
                
        # biases
        self.fb = np.zeros(sizeHidden);
        self.ib = np.zeros(sizeHidden); 
        self.ob = np.zeros(sizeHidden); 
        self.mb = np.zeros(sizeHidden); 
        
        # for RMSprop only
        self.GxfW = np.random.random((sizeHidden,lenIn));
        self.GxiW = np.random.random((sizeHidden,lenIn));
        self.GxoW = np.random.random((sizeHidden,lenIn));
        self.GxmW = np.random.random((sizeHidden,lenIn));
        
        self.GhfW = np.random.random((sizeHidden,sizeHidden));
        self.GhiW = np.random.random((sizeHidden,sizeHidden));
        self.GhoW = np.random.random((sizeHidden,sizeHidden));
        self.GhmW = np.random.random((sizeHidden,sizeHidden));         
              
        self.Gfb = np.zeros(sizeHidden);
        self.Gib = np.zeros(sizeHidden); 
        self.Gob = np.zeros(sizeHidden); 
        self.Gmb = np.zeros(sizeHidden);         
             
        
    def fwd_pass(self):
        #print(self.x.shape)
        f       = sigmoid(np.dot(self.xfW, self.x) + np.dot(self.hfW, self.ph) + self.fb)
        self.c *= f
        i       = sigmoid(np.dot(self.xiW, self.x) + np.dot(self.hiW, self.ph) + self.ib)
        m       = tanh(np.dot(self.xmW, self.x)    + np.dot(self.hmW, self.ph) + self.mb)
        self.c += i * m
        o       = sigmoid(np.dot(self.xoW, self.x) + np.dot(self.hoW, self.ph)  + self.ob)
        self.h  = o * tanh(self.c)
        
        return self.c, self.h, f, i, m, o;
    
    def bwd_pass(self,error, prev_c, f, i, m, o, c_g, h_g):
        
        error = np.clip(error + h_g, -6, 6);
        
        do = tanh(self.c) * error;
        xo = np.dot(np.atleast_2d(do*tanh_deriv(o)).T, np.atleast_2d(self.x));
        ho = np.dot(np.atleast_2d(do*tanh_deriv(o)).T, np.atleast_2d(self.ph));
        
        dcs= np.clip(error*o* tanh_deriv(self.c) + c_g,-6, 6);
        dm = dcs * i;
        xm = np.dot(np.atleast_2d(dm * tanh_deriv(m)).T, np.atleast_2d(self.x));
        hm = np.dot(np.atleast_2d(dm * tanh_deriv(m)).T, np.atleast_2d(self.ph));
        
        di = dcs * m;
        xi = np.dot(np.atleast_2d(dm * sigmoid_deriv(m)).T, np.atleast_2d(self.x));
        hi = np.dot(np.atleast_2d(dm * sigmoid_deriv(m)).T, np.atleast_2d(self.ph));
        
        df = dcs * f;
        xf = np.dot(np.atleast_2d(dm * sigmoid_deriv(f)).T, np.atleast_2d(self.x));
        hf = np.dot(np.atleast_2d(dm * sigmoid_deriv(f)).T, np.atleast_2d(self.ph));
        
        c_grad = dcs * f;
        h_grad = np.dot(self.hfW, df) + np.dot(self.hiW, di) +\
                          np.dot(self.hoW, do) + np.dot(self.hmW, dm);
        
        
        return xf,xi,xm,xo, hf,hi,hm,ho, c_grad,h_grad, df,di,dm,do;
    
    def update(self, xf, xi, xm, xo, hf, hi, hm, ho, fb, ib, mb, ob):

        self.GxfW = 0.9*self.GxfW + 0.1*xf**2;
        self.GxiW = 0.9*self.GxiW + 0.1*xi**2;
        self.GxoW = 0.9*self.GxoW + 0.1*xo**2;
        self.GxmW = 0.9*self.GxmW + 0.1*xm**2;
        self.GhfW = 0.9*self.GhfW + 0.1*hf**2;
        self.GhiW = 0.9*self.GhiW + 0.1*hi**2;
        self.GhoW = 0.9*self.GhoW + 0.1*ho**2;
        self.GhmW = 0.9*self.GhmW + 0.1*hm**2;
        self.Gfb  = 0.9*self.Gfb  + 0.1*fb**2;
        self.Gib  = 0.9*self.Gib  + 0.1*ib**2;
        self.Gmb  = 0.9*self.Gmb  + 0.1*mb**2;
        self.Gob  = 0.9*self.Gob  + 0.1*ob**2;
        

In [325]:
inp = np.zeros((5,10))
tar = np.zeros((5,10))
lenIn, lenOut, lenRec, sizeHidden, numHiddenLayer, inputs,targets, learningRate, biDir = 10,10,5,8,1,inp,tar,0.1,0;
R = myRNN(lenIn, lenOut, lenRec, sizeHidden, numHiddenLayer,inputs, targets, learningRate, biDir=0)

R.fwd_pass()

R.bwd_pass()



print(R.LSTM[0].lenIn)

10


In [326]:
data = open('HP1.txt','r', encoding="utf8").read();
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(data_size,", ",vocab_size)

char_to_ix = {ch:i for i, ch in enumerate(chars)}
ix_to_char = {i:ch for i,ch in enumerate(chars)}
print(char_to_ix)
print(ix_to_char)

431677 ,  79
{'9': 0, '?': 1, 'e': 2, '*': 3, ':': 4, 'c': 5, 'n': 6, 'L': 7, '~': 8, 'X': 9, 'u': 10, 'S': 11, 'd': 12, 'T': 13, 'o': 14, 'm': 15, 'Q': 16, '\n': 17, '0': 18, '\\': 19, '"': 20, '2': 21, 'N': 22, 'C': 23, 'B': 24, 'M': 25, '7': 26, '4': 27, 'y': 28, '!': 29, 'b': 30, 'H': 31, 's': 32, 'v': 33, '\t': 34, "'": 35, 'p': 36, 'D': 37, 'j': 38, '3': 39, '(': 40, 'P': 41, 'Y': 42, 'g': 43, '1': 44, 'Z': 45, 'k': 46, 'z': 47, 'a': 48, 'R': 49, 'F': 50, 'E': 51, 'J': 52, '8': 53, 'h': 54, 'K': 55, 'q': 56, ' ': 57, '.': 58, 't': 59, 'A': 60, 'G': 61, '-': 62, '5': 63, 'f': 64, 'x': 65, 'U': 66, 'W': 67, 'r': 68, 'O': 69, 'l': 70, ',': 71, 'w': 72, '6': 73, 'I': 74, 'i': 75, ')': 76, 'V': 77, ';': 78}
{0: '9', 1: '?', 2: 'e', 3: '*', 4: ':', 5: 'c', 6: 'n', 7: 'L', 8: '~', 9: 'X', 10: 'u', 11: 'S', 12: 'd', 13: 'T', 14: 'o', 15: 'm', 16: 'Q', 17: '\n', 18: '0', 19: '\\', 20: '"', 21: '2', 22: 'N', 23: 'C', 24: 'B', 25: 'M', 26: '7', 27: '4', 28: 'y', 29: '!', 30: 'b', 31: 'H', 3

In [327]:
def encode(array,num_entry):
    xs = np.zeros((len(array),num_entry))
    for i in range(len(array)):
        xs[i][array[i]] = 1; 
    return xs;

In [329]:
seq_length,position = 25,0
inputs = [char_to_ix[ch] for ch in data[position:position+seq_length]]
print(data[position:position+seq_length])
print("inputs",inputs)

targets = [char_to_ix[ch] for ch in data[position+1:position+seq_length+1]] 
print(data[position+1:position+seq_length+1])
print("targets",targets)

n,position = 0,0;
epoch = 20*1000;
lenIn, lenOut, lenRec = vocab_size,vocab_size, seq_length;
sizeHidden, numHiddenLayer = 100,1;
learningRate, biDir = 0.1,0;


R = myRNN(lenIn, lenOut, lenRec, sizeHidden, numHiddenLayer, encode(inputs,vocab_size),encode(targets,vocab_size), learningRate, biDir=0)

# training
while n<epoch:
    
    if(position+seq_length+1 >= len(data) or n == 0):
        position = 0;
        
    inputs = [char_to_ix[ch] for ch in data[position:position+seq_length]]
    targets = [char_to_ix[ch] for ch in data[position+1:position+seq_length+1]] 
    
    R.update_inputs_targets(encode(inputs,vocab_size),encode(targets,vocab_size));
    
    err = R.fwd_pass();
    
    R.bwd_pass();
    
    if(n%1000 == 0):
        print(n,"err:",err)
        seeds = encode(inputs,vocab_size);
        seed = np.array(seeds[0])
        for i in range(100):
            #print(seed.shape)
            ret = R.sample(seed);
            #print(ret);
            seed = np.zeros_like(seed);
            seed[ret] = 1;
            

    position += seq_length;
    position += seq_length;
    n += 1;

Harry Potter and the Sorc
inputs [31, 48, 68, 68, 28, 57, 41, 14, 59, 59, 2, 68, 57, 48, 6, 12, 57, 59, 54, 2, 57, 11, 14, 68, 5]
arry Potter and the Sorce
targets [48, 68, 68, 28, 57, 41, 14, 59, 59, 2, 68, 57, 48, 6, 12, 57, 59, 54, 2, 57, 11, 14, 68, 5, 2]
0 err: 109.2361963116755


KeyboardInterrupt: 