In [10]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
# Select the matplotlib style sheet used by every figure in this notebook.
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-white') — confirm this name against the installed version.
plt.style.use('seaborn-white')

# READ and PROCESS data

In [11]:

# Read the whole training corpus into memory as one string.
# The context manager guarantees the file handle is closed once the text is read.
with open("shakespear.txt","r") as corpus_file:
    data = corpus_file.read()

In [12]:
# Vocabulary: every distinct character in the corpus.
# Sorting makes the char <-> index mapping deterministic; iterating a bare set of
# strings can yield a different order on every kernel start (hash randomisation),
# which would silently change the meaning of every index between runs.
chars = sorted(set(data))
data_size, x_size = len(data), len(chars)
print(f"data has {data_size} characters, {x_size} unique")
# Lookup tables between characters and their integer ids.
char2idx = {ch: i for i, ch in enumerate(chars)}
idx2char = dict(enumerate(chars))

data has 99993 characters, 62 unique


# Constants and Hyperparameters

In [13]:
# --- Model / training hyperparameters -------------------------------------
h_size = 100    # number of units in the hidden layer
T_steps = 25    # number of time steps per sequence
lr = 0.1        # presumably the learning rate; the update step is not in this section
std = 0.1       # standard deviation used when drawing initial weights
# Combined length of the hidden state and the one-hot input.
z_size = x_size + h_size

# Activation functions and their derivatives

In [14]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def dsigmoid(y):
    """Derivative of the sigmoid expressed via its output: y * (1 - y).

    ``y`` is the already-activated value, not the pre-activation input.
    """
    complement = 1 - y
    return y * complement

def tanh(x):
    """Element-wise hyperbolic tangent (delegates to ``np.tanh``)."""
    activated = np.tanh(x)
    return activated

def dtanh(y):
    """Derivative of tanh expressed via its output: 1 - y^2.

    ``y`` is the already-activated value, not the pre-activation input.
    """
    squared = y * y
    return 1 - squared

# Parameters 

In [15]:
class Param:
    """A named trainable array together with its per-element bookkeeping buffers."""

    def __init__(self, name, value):
        self.name, self.v = name, value
        # Both buffers start as zeros with the same shape and dtype as the value:
        #   d - accumulated derivative
        #   m - Adagrad memory (per the original note; the optimizer is not in this section)
        self.d = np.zeros_like(value)
        self.m = np.zeros_like(value)
"""zeros obtains memory from the operating system so that the OS zeroes it when it is first used. 
zeros_like on the other hand fills the alloced memory with zeros by itself. B"""

'zeros obtains memory from the operating system so that the OS zeroes it when it is first used. \nzeros_like on the other hand fills the alloced memory with zeros by itself. B'

In [16]:
class Parameters:
    """Container for every trainable weight and bias of the network.

    Weight matrices are drawn from a normal distribution scaled by ``std``.
    Those feeding a sigmoid (W_f, W_i, W_o) are shifted to mean 0.5, while those
    feeding a tanh (W_C) and the output layer (W_v) keep mean 0.
    All biases start at zero.
    """

    def __init__(self):
        def gaussian(rows, cols, mean=None):
            # randn * std gives N(0, std^2); optionally shift to the requested mean.
            drawn = np.random.randn(rows, cols) * std
            return drawn if mean is None else drawn + mean

        def zero_column(rows):
            return np.zeros((rows, 1))

        # forget gate
        self.W_f = Param('W_f', gaussian(h_size, z_size, 0.5))
        self.b_f = Param('b_f', zero_column(h_size))
        # input gate
        self.W_i = Param('W_i', gaussian(h_size, z_size, 0.5))
        self.b_i = Param('b_i', zero_column(h_size))
        # candidate cell state
        self.W_C = Param('W_C', gaussian(h_size, z_size))
        self.b_C = Param('b_C', zero_column(h_size))
        # output gate
        self.W_o = Param('W_o', gaussian(h_size, z_size, 0.5))
        self.b_o = Param('b_o', zero_column(h_size))
        # final prediction layer
        self.W_v = Param('W_v', gaussian(x_size, h_size))
        self.b_v = Param('b_v', zero_column(x_size))

    def all(self):
        """Return every Param: the five weight matrices first, then the five biases."""
        weights = [self.W_f, self.W_i, self.W_C, self.W_o, self.W_v]
        biases = [self.b_f, self.b_i, self.b_C, self.b_o, self.b_v]
        return weights + biases

# Module-level instance used as the default by the functions defined below.
parameters = Parameters()



# forward

In [18]:
def forward_pass(x,h_prev,C_prev,p=parameters):
    """Run one LSTM time step followed by the softmax output layer.

    Args:
        x: input column vector of shape (x_size, 1).
        h_prev: previous hidden state, shape (h_size, 1).
        C_prev: previous cell state, shape (h_size, 1).
        p: parameter container exposing W_f/b_f, W_i/b_i, W_C/b_C, W_o/b_o and
            W_v/b_v, each with its array in ``.v``.

    Returns:
        Tuple ``(z, f, i, o, C, C_bar, h, v, y)``: the stacked gate input, the
        forget/input/output gate activations, the new cell state, the candidate
        cell state, the new hidden state, the output logits and the softmax
        probabilities.
    """
    # ``shape`` is a tuple attribute, not a method.
    assert x.shape == (x_size, 1)
    assert h_prev.shape == (h_size, 1)
    assert C_prev.shape == (h_size, 1)

    # np.vstack takes ONE sequence of arrays; hidden state on top, input below.
    z = np.vstack((h_prev, x))
    f = sigmoid(p.W_f.v@z + p.b_f.v)
    i = sigmoid(p.W_i.v@z + p.b_i.v)
    C_bar = tanh(p.W_C.v@z + p.b_C.v)
    o = sigmoid(p.W_o.v@z + p.b_o.v)

    # Keep part of the old cell state and add the gated candidate.
    C = C_prev*f + i*C_bar

    h = tanh(C)*o

    v = p.W_v.v@h + p.b_v.v
    # Softmax. Subtracting max(v) leaves the result mathematically unchanged
    # but prevents exp() from overflowing on large logits.
    exp_v = np.exp(v - np.max(v))
    y = exp_v / np.sum(exp_v)

    return z, f, i, o, C, C_bar, h, v, y


    
    
    
    

In [22]:
def clear_gradients(params = parameters):
    """Reset the accumulated derivative of every parameter to zero, in place."""
    for entry in params.all():
        entry.d[...] = 0

In [23]:
def clip_gradients(params = parameters):
    """Clamp every parameter's derivative element-wise to [-1, 1], in place."""
    lower, upper = -1, 1
    for entry in params.all():
        grad = entry.d
        np.clip(grad, lower, upper, out=grad)

# backward 

In [25]:
def back_prop(target, dh_next, dC_next, C_prev, z, f, i, o, C, C_bar, h, v, y, p=parameters):
    """Backpropagate through one LSTM time step, accumulating gradients into ``p``.

    Args:
        target: row index of the correct class in ``y`` for this step.
        dh_next: gradient w.r.t. this step's hidden state coming from the
            following time step, shape (h_size, 1).
        dC_next: gradient w.r.t. this step's cell state coming from the
            following time step, shape (h_size, 1).
        C_prev: cell state that entered this step, shape (h_size, 1).
        z, f, i, o, C, C_bar, h, v, y: values from the forward step, in the
            order ``forward_pass`` returns them.
        p: parameter container; the ``.d`` array of each weight and bias is
            incremented in place.

    Returns:
        ``(dh_prev, dC_prev)``: gradients to hand to the preceding time step.
    """
    assert z.shape == (x_size + h_size, 1)
    assert v.shape == (x_size, 1)
    assert y.shape == (x_size, 1)

    for param in [dh_next, dC_next, C_prev, f, i, C_bar, C, o, h]:
        assert param.shape == (h_size, 1)

    # delta of v: softmax + cross-entropy gradient is y with 1 subtracted at the target row
    dv = np.copy(y)
    dv[target] -= 1
    # update W, b of v
    p.W_v.d += dv @ h.T
    p.b_v.d += dv

    # delta of h. The weight array lives in ``.v``; Param itself has no ``.T``.
    dh = dh_next + p.W_v.v.T @ dv

    # tanh(C) is needed for both the output gate and the cell state; compute it once.
    tanh_C = tanh(C)

    # delta of o
    do = dh * tanh_C
    do = dsigmoid(o) * do
    # update W, b of o
    p.W_o.d += do @ z.T
    p.b_o.d += do

    # delta of C
    dC = dC_next + dh*o*dtanh(tanh_C)

    # delta of C_bar
    dC_bar = i * dC
    dC_bar = dtanh(C_bar) * dC_bar
    # update W, b of C_bar
    p.W_C.d += dC_bar @ z.T
    p.b_C.d += dC_bar

    # delta of i
    di = C_bar * dC
    di = dsigmoid(i) * di
    # update W, b of i
    p.W_i.d += di @ z.T
    p.b_i.d += di

    # delta of f
    df = dC * C_prev
    df = dsigmoid(f) * df
    # update W, b of f
    p.W_f.d += df @ z.T
    p.b_f.d += df

    # delta of z: sum of the contributions flowing back through all four gates
    dz = p.W_f.v.T@df + p.W_C.v.T@dC_bar +\
         p.W_i.v.T@di + p.W_o.v.T@do

    # z stacks the hidden state first, so its leading h_size rows belong to h_prev.
    dh_prev = dz[:h_size,:]
    dC_prev = f * dC

    return dh_prev, dC_prev

In [None]:
def forward_backward(inputs, targets, h_prev, C_prev):
    """Run a full forward and backward pass over one sequence of T_steps steps.

    Gradients are left, clipped, in the ``.d`` arrays of the module-level
    ``parameters``; they are zeroed before the backward pass starts.

    Args:
        inputs: sequence of T_steps integer indices, one-hot encoded here as
            the input at each step (presumably character ids — confirm
            against the caller).
        targets: sequence of integer indices of the expected output at each
            step, aligned with ``inputs``.
        h_prev: hidden state carried in from the previous sequence, (h_size, 1).
        C_prev: cell state carried in from the previous sequence, (h_size, 1).

    Returns:
        ``(loss, h_last, C_last)``: the summed cross-entropy loss plus the
        hidden and cell state after the final step.
    """
    # Per-time-step storage for every intermediate value of the forward pass.
    x_layer = {}
    z_layer = {}
    f_layer = {}
    i_layer = {}
    o_layer = {}
    C_layer = {}
    C_bar_layer = {}
    h_layer = {}
    v_layer = {}
    y_layer = {}

    # Index -1 holds the state that precedes step 0.
    C_layer[-1] = np.copy(C_prev)
    h_layer[-1] = np.copy(h_prev)

    loss = 0

    assert len(inputs) == T_steps

    for t in range(T_steps):
        # one-hot encode the INPUT for this step
        x_layer[t] = np.zeros((x_size, 1))
        x_layer[t][inputs[t]] = 1

        # Feed the state produced by the PREVIOUS step and save each layer's result.
        (z_layer[t], f_layer[t], i_layer[t], o_layer[t],
         C_layer[t], C_bar_layer[t], h_layer[t], v_layer[t], y_layer[t]) = \
            forward_pass(x_layer[t], h_layer[t - 1], C_layer[t - 1], p=parameters)

        # Accumulate cross-entropy loss; y is a column vector, so [k, 0] yields a scalar.
        loss += -np.log(y_layer[t][targets[t], 0])

    clear_gradients()

    # No gradient flows in from beyond the last step.
    dh_next = np.zeros_like(h_layer[0])
    dC_next = np.zeros_like(C_layer[0])

    for t in reversed(range(len(inputs))):
        dh_next, dC_next = back_prop(targets[t], dh_next, dC_next, C_layer[t-1], z_layer[t], f_layer[t],
                                     i_layer[t], o_layer[t], C_layer[t], C_bar_layer[t], h_layer[t],
                                     v_layer[t], y_layer[t])

    clip_gradients()

    return loss, h_layer[len(inputs)-1], C_layer[len(inputs) -1]
    
    
    
    

In [20]:
# Scratch 1-D array with five integer elements.
a = np.array((1, 2, 3, 4, 5))

In [21]:
# Scratch check: `a` is 1-D, so indexing it with two indices raises the
# IndexError recorded below; `[k, 0]` indexing needs a 2-D array.
a[3,0]

IndexError: too many indices for array