In [None]:
# Math of LSTM Networks
# https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb
# http://colah.github.io/posts/2015-08-Understanding-LSTMs/
import numpy as np

class RecurrentNeuralNetwork:
    """Unrolled recurrent network that drives a single LSTM cell over a sequence.

    The network keeps one row per time step (rows 1..rl) for inputs, cell
    states, hidden states, gate activations and outputs. Row 0 of every
    history array is never written by the forward pass and therefore stays at
    its initial all-zero value; it acts as the state "before the sequence".

    Constructor arguments:
        xs: size of one input vector.
        ys: size of one output / hidden / cell-state vector.
        rl: number of recurrences (time steps) to unroll.
        eo: expected outputs, indexed as eo[:, t]; it is transposed and
            prefixed with one zero row, so self.eo[i] is the target of step i.
            # assumes eo has shape (ys, rl) and xs == ys, because rows of
            # self.eo are also fed back as inputs -- TODO confirm with caller
        lr: learning rate used by the RMSProp updates.
    """

    def __init__(self, xs, ys, rl, eo, lr):
        # current input vector (starts as all zeros)
        self.x = np.zeros(xs)
        # input size
        self.xs = xs
        # output vector placeholder
        self.y = np.zeros(ys)
        # output size
        self.ys = ys
        # read-out weights: map the LSTM hidden state to the output scores
        self.w = np.random.random((ys, ys))
        # running average of squared gradients for RMSProp on self.w
        self.G = np.zeros_like(self.w)
        # number of recurrences
        self.rl = rl
        # learning rate
        self.lr = lr

        # Per-step histories; index 0 is the zero "initial" row.
        self.ia = np.zeros((rl + 1, xs))  # inputs
        self.ca = np.zeros((rl + 1, ys))  # cell states
        self.oa = np.zeros((rl + 1, ys))  # outputs
        self.ha = np.zeros((rl + 1, ys))  # hidden states

        # Per-step gate activations.
        self.af = np.zeros((rl + 1, ys))  # forget gate
        self.ai = np.zeros((rl + 1, ys))  # input gate
        self.ac = np.zeros((rl + 1, ys))  # candidate cell values
        self.ao = np.zeros((rl + 1, ys))  # output gate

        # expected outputs, one row per step, with a zero row prepended for step 0
        self.eo = np.vstack((np.zeros(eo.shape[0]), eo.T))
        # the LSTM cell (input size, output size, recurrences, learning rate)
        self.LSTM = LSTM(xs, ys, rl, lr)

    def sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
        return 1 / (1 + np.exp(-x))

    def dsigmoid(self, x):
        """Derivative of the logistic function evaluated at pre-activation x."""
        s = self.sigmoid(x)
        return s * (1 - s)

    def forwardProp(self):
        """Run one teacher-forced pass over all rl steps and return self.oa.

        The input of step i is the expected output of step i-1 (self.eo[i-1]),
        so the target self.eo[i] is the element that follows the input. Every
        input is recorded in self.ia so backProp can rebuild the cell input.
        """
        # Start each pass from a zero cell state, matching the zero rows
        # ca[0] / ha[0] that backProp uses as the state before step 1.
        self.LSTM.cs = np.zeros(self.ys)
        for i in range(1, self.rl + 1):
            self.x = self.eo[i - 1]
            self.ia[i] = self.x
            # cell input = previous hidden state followed by the current input
            self.LSTM.x = np.hstack((self.ha[i - 1], self.x))
            cs, hs, f, inp, c, o = self.LSTM.forwardProp()
            self.ca[i] = cs
            self.ha[i] = hs
            self.af[i] = f
            self.ai[i] = inp
            self.ac[i] = c
            self.ao[i] = o
            self.oa[i] = self.sigmoid(np.dot(self.w, hs))
        return self.oa

    def backProp(self):
        """Back-propagate through all steps, apply the updates, return the error sum.

        Gradients are accumulated from step rl down to step 1, averaged over
        rl, and applied to the LSTM gate matrices and to self.w.

        Returns:
            The sum over all steps of the error signal passed into the LSTM
            cell. NOTE(review): this is a signed sum, so positive and negative
            contributions can cancel; treat it as a rough progress indicator.
        """
        totalError = 0
        # gradients flowing back from the following step (cell / hidden state)
        dfcs = np.zeros(self.ys)
        dfhs = np.zeros(self.ys)
        # accumulated gradient for the read-out weights
        tu = np.zeros((self.ys, self.ys))
        # accumulated gradients for the four LSTM gate matrices
        gate_shape = (self.ys, self.xs + self.ys)
        tfu = np.zeros(gate_shape)
        tiu = np.zeros(gate_shape)
        tcu = np.zeros(gate_shape)
        tou = np.zeros(gate_shape)

        # Stop at step 1: step 0 is only the zero initial row, and i == 0
        # would make ha[i-1] / ca[i-1] wrap around to the last step.
        for i in range(self.rl, 0, -1):
            # error = calculated output - expected output
            error = self.oa[i] - self.eo[i]
            # oa[i] already holds sigmoid activations, so the derivative of
            # the sigmoid at that point is a * (1 - a).
            delta = error * self.oa[i] * (1 - self.oa[i])

            # oa = sigmoid(w . ha), hence dE/dw[j, k] = delta[j] * ha[k]
            tu += np.outer(delta, self.ha[i])

            # error signal arriving at the hidden state of the LSTM cell
            hidden_error = np.dot(delta, self.w)

            # restore the cell's input and (post-step) cell state for step i;
            # copy so the cell never holds a view into the history array
            self.LSTM.x = np.hstack((self.ha[i - 1], self.ia[i]))
            self.LSTM.cs = self.ca[i].copy()

            fu, iu, cu, ou, dfcs, dfhs = self.LSTM.backProp(
                hidden_error, self.ca[i - 1], self.af[i], self.ai[i],
                self.ac[i], self.ao[i], dfcs, dfhs)

            totalError += np.sum(hidden_error)

            tfu += fu
            tiu += iu
            tcu += cu
            tou += ou

        # apply the step-averaged gradients
        self.LSTM.update(tfu / self.rl, tiu / self.rl, tcu / self.rl, tou / self.rl)
        self.update(tu / self.rl)
        return totalError

    def update(self, u):
        """RMSProp step on the read-out weights using gradient u."""
        self.G = 0.9 * self.G + 0.1 * u**2
        self.w -= self.lr / np.sqrt(self.G + 1e-8) * u

    def sample(self):
        """Generate rl steps by feeding each step's arg-max output back as input.

        Starts from the current self.x (the caller sets a seed vector) and
        overwrites the per-step history arrays. Returns self.oa.
        """
        # begin from a zero cell state, consistent with the zero row ha[0]
        self.LSTM.cs = np.zeros(self.ys)
        for i in range(1, self.rl + 1):
            # cell input = previous hidden state followed by the current input
            self.LSTM.x = np.hstack((self.ha[i - 1], self.x))
            cs, hs, f, inp, c, o = self.LSTM.forwardProp()

            # record the input as a one-hot vector at its arg-max position
            maxI = np.argmax(self.x)
            self.x = np.zeros_like(self.x)
            self.x[maxI] = 1
            self.ia[i] = self.x

            self.ca[i] = cs
            self.ha[i] = hs
            self.af[i] = f
            self.ai[i] = inp
            self.ac[i] = c
            self.ao[i] = o

            # output = sigmoid(read-out weights . hidden state)
            self.oa[i] = self.sigmoid(np.dot(self.w, hs))

            # next input is the one-hot vector of the strongest output
            maxI = np.argmax(self.oa[i])
            newX = np.zeros_like(self.x)
            newX[maxI] = 1
            self.x = newX
        return self.oa
    

    

In [None]:
class LSTM:
    """A single LSTM cell with four gate matrices and RMSProp updates.

    The cell input self.x is the concatenation of the previous hidden state
    (first ys entries) and the external input (remaining xs entries); the
    caller assigns self.x and self.cs before calling forwardProp / backProp.

    Constructor arguments:
        xs: size of the external input vector.
        ys: size of the hidden / cell-state vector.
        rl: number of recurrences (stored, not used inside the cell).
        lr: learning rate for the RMSProp updates.
    """

    def __init__(self, xs, ys, rl, lr):
        # cell input: hidden state and external input stacked together
        self.x = np.zeros(xs + ys)
        # size of the stacked input
        self.xs = xs + ys
        # cell output (hidden state)
        self.y = np.zeros(ys)
        # output size
        self.ys = ys
        # cell state
        self.cs = np.zeros(ys)
        # number of recurrences
        self.rl = rl
        # learning rate
        self.lr = lr
        # gate weight matrices, each (ys, xs + ys), uniform in [0, 1)
        self.f = np.random.random((ys, xs + ys))  # forget gate
        self.i = np.random.random((ys, xs + ys))  # input gate
        self.c = np.random.random((ys, xs + ys))  # candidate cell values
        self.o = np.random.random((ys, xs + ys))  # output gate
        # RMSProp running averages of squared gradients, one per gate matrix
        self.Gf = np.zeros_like(self.f)
        self.Gi = np.zeros_like(self.i)
        self.Gc = np.zeros_like(self.c)
        self.Go = np.zeros_like(self.o)

    def sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
        return 1 / (1 + np.exp(-x))

    def dsigmoid(self, x):
        """Derivative of the logistic function evaluated at pre-activation x."""
        s = self.sigmoid(x)
        return s * (1 - s)

    def tangent(self, x):
        """Hyperbolic tangent, applied element-wise (output in (-1, 1))."""
        return np.tanh(x)

    def dtangent(self, x):
        """Derivative of tanh evaluated at pre-activation x."""
        return 1 - np.tanh(x)**2

    def forwardProp(self):
        """Advance the cell one step from self.x and the current self.cs.

        Returns:
            (cell state, hidden state, forget gate, input gate,
             candidate values, output gate) for this step.
        """
        f = self.sigmoid(np.dot(self.f, self.x))
        i = self.sigmoid(np.dot(self.i, self.x))
        c = self.tangent(np.dot(self.c, self.x))
        o = self.sigmoid(np.dot(self.o, self.x))
        # Build a new array instead of updating in place: self.cs may be an
        # array (or a view) supplied by the caller, which must not be changed.
        self.cs = f * self.cs + i * c
        self.y = o * self.tangent(self.cs)
        return self.cs, self.y, f, i, c, o

    def backProp(self, e, pcs, f, i, c, o, dfcs, dfhs):
        """Compute gate gradients for one step.

        self.x must hold the step's stacked input and self.cs the cell state
        produced by that step.

        Args:
            e: error signal arriving at this step's hidden state.
            pcs: cell state before this step.
            f, i, c, o: gate activations recorded by forwardProp for this step.
            dfcs: gradient w.r.t. this step's cell state from the next step.
            dfhs: gradient w.r.t. this step's hidden state from the next step.

        Returns:
            (fu, iu, cu, ou, dpcs, dphs): gradients for the forget, input,
            candidate and output matrices, then the gradients w.r.t. the
            previous cell state and the previous hidden state.
        """
        # total hidden-state gradient, clipped to [-6, 6]
        e = np.clip(e + dfhs, -6, 6)
        tanh_cs = self.tangent(self.cs)

        # f, i, c and o are activations, so their local derivatives are
        # a * (1 - a) for the sigmoid gates and 1 - a**2 for the tanh candidate.
        # output gate: h = o * tanh(cs)
        do = tanh_cs * e
        do_pre = do * o * (1 - o)
        ou = np.dot(np.atleast_2d(do_pre).T, np.atleast_2d(self.x))

        # cell state gradient, clipped to [-6, 6]
        dcs = np.clip(e * o * (1 - tanh_cs**2) + dfcs, -6, 6)

        # candidate values: cs = f * pcs + i * c
        dc = dcs * i
        dc_pre = dc * (1 - c**2)
        cu = np.dot(np.atleast_2d(dc_pre).T, np.atleast_2d(self.x))

        # input gate
        di = dcs * c
        di_pre = di * i * (1 - i)
        iu = np.dot(np.atleast_2d(di_pre).T, np.atleast_2d(self.x))

        # forget gate
        df = dcs * pcs
        df_pre = df * f * (1 - f)
        fu = np.dot(np.atleast_2d(df_pre).T, np.atleast_2d(self.x))

        # gradient handed to the previous step's cell state
        dpcs = dcs * f

        # gradient w.r.t. the stacked input; its first ys entries belong to
        # the previous hidden state
        dx = (np.dot(dc_pre, self.c) + np.dot(do_pre, self.o)
              + np.dot(di_pre, self.i) + np.dot(df_pre, self.f))
        dphs = dx[:self.ys]
        return fu, iu, cu, ou, dpcs, dphs

    def update(self, fu, iu, cu, ou):
        """RMSProp step on all four gate matrices with the given gradients."""
        self.Gf = 0.9 * self.Gf + 0.1 * fu**2
        self.Gi = 0.9 * self.Gi + 0.1 * iu**2
        self.Gc = 0.9 * self.Gc + 0.1 * cu**2
        self.Go = 0.9 * self.Go + 0.1 * ou**2

        self.f -= self.lr / np.sqrt(self.Gf + 1e-8) * fu
        self.i -= self.lr / np.sqrt(self.Gi + 1e-8) * iu
        self.c -= self.lr / np.sqrt(self.Gc + 1e-8) * cu
        self.o -= self.lr / np.sqrt(self.Go + 1e-8) * ou
     
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
       

In [None]:
def LoadText(path="eminem.xtxt"):
    """Read a text file and one-hot encode it character by character.

    Args:
        path: file to read. The default keeps the original hard-coded name.
            # NOTE(review): the ".xtxt" extension looks like a typo for
            # ".txt" -- confirm against the actual data file.

    Returns:
        returnData: identity matrix with the vocabulary appended as a last
            row (the append makes the whole array a string array).
        uniqueWords: number of distinct characters.
        output: float array of shape (uniqueWords, outputSize); column t is
            the one-hot vector of the t-th character of the file.
        outputSize: number of characters in the file.
        data: list of the distinct characters; row k of output corresponds
            to data[k].
    """
    with open(path, "r") as text_file:
        text = list(text_file.read())

    outputSize = len(text)
    # Sort the vocabulary so the character -> row mapping is the same on
    # every run (plain set iteration order is not stable between runs).
    data = sorted(set(text))
    uniqueWords = len(data)

    # identity matrix with the characters themselves appended as the last row
    returnData = np.append(np.eye(uniqueWords), np.atleast_2d(data), axis=0)

    # one dictionary lookup per character instead of one array scan per character
    row_of = {ch: k for k, ch in enumerate(data)}
    rows = np.array([row_of[ch] for ch in text], dtype=int)
    output = np.zeros((uniqueWords, outputSize))
    output[rows, np.arange(outputSize)] = 1
    return returnData, uniqueWords, output, outputSize, data

# Write the predicted output (series of characters) to disk
def ExportText(output, data, filename="output.txt"):
    """Sample one entry of data per row of output and write the text to a file.

    Each row of output is normalised to a probability distribution over data
    and one entry is drawn from it with np.random.choice.

    Rows whose sum is zero or not finite cannot be normalised (0/0 gives NaN
    probabilities, which np.random.choice rejects), so they are skipped. An
    output array whose first row was never filled in is one such case.

    Args:
        output: 2-D array with one row of non-negative scores per step.
        data: sequence of symbols; data[j] corresponds to column j of output.
        filename: destination file; defaults to the original "output.txt".
    """
    print(len(data))
    print(output.shape[0])
    pieces = []
    for row in output:
        total = np.sum(row)
        if not np.isfinite(total) or total <= 0:
            continue
        pieces.append(np.random.choice(data, p=row / total))
    with open(filename, "w") as text_file:
        text_file.write("".join(pieces))

In [None]:
# Entry point: load the text, train the network, periodically sample from it
print("Beginning")
iterations = 5000
learningRate = 0.001

# One-hot encoded text plus the vocabulary it was built from
returnData, numCategories, expectedOutput, outputSize, data = LoadText()
print("Done Reading")

# Input and output sizes are both the vocabulary size; one recurrence per character
RNN = RecurrentNeuralNetwork(numCategories, numCategories, outputSize, expectedOutput, learningRate)


# Training loop
for i in range(1, iterations):
    # forward pass followed by the weight update
    RNN.forwardProp()
    error = RNN.backProp()
    print("Error on iteration ", i, ": ", error)

    # Only sample when the error is inside (-100, 100) or on every 100th iteration
    if not (-100 < error < 100 or i % 100 == 0):
        continue

    # Pick a random one-hot seed vector as the first input
    seed = np.zeros_like(RNN.x)
    maxI = np.argmax(np.random.random(RNN.x.shape))
    seed[maxI] = 1
    RNN.x = seed

    # Generate a sequence from the seed and write it to disk
    output = RNN.sample()
    print(output)
    ExportText(output, data)
    print("Done Writing")
print("Complete")