In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Read the whole training corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it to the GC.
with open('../data/rnn/shakespear.txt', 'r') as corpus_file:
    data = corpus_file.read()

In [3]:
# Vocabulary of distinct characters. Sorted so that the char <-> index mapping
# built from it is identical on every run (plain set iteration order of strings
# can change between interpreter sessions).
data_unique_chars = sorted(set(data))

In [4]:
data_size = len(data)  # number of characters in the corpus
X_size = len(data_unique_chars)  # vocabulary size, i.e. length of a one-hot vector

In [5]:
# Lookup table: character -> integer index (position in data_unique_chars)
char_to_idx = dict(zip(data_unique_chars, range(len(data_unique_chars))))

In [6]:
# Lookup table: integer index -> character (inverse of char_to_idx)
idx_to_char = dict(enumerate(data_unique_chars))

In [7]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x))."""
    return 1/(1+np.exp(-x))

In [8]:
def dsigmoid(y):
    """Derivative of the sigmoid, expressed in terms of its OUTPUT value y.

    Pass sigmoid(x), not x itself.
    """
    return y*(1-y)

In [9]:
def tanh(x):
    """Element-wise hyperbolic tangent (thin wrapper around np.tanh)."""
    return np.tanh(x)

In [10]:
def dtanh(y):
    """Derivative of tanh, expressed in terms of its OUTPUT value y.

    Pass tanh(x), not x itself.
    """
    return 1-y*y

In [11]:
def softmax(x):
    """Softmax normalised over ALL elements of x.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps np.exp from overflowing to inf (which
    would turn the output into NaN) when the logits are large.

    Parameters
    ----------
    x : array_like
        Logits of any shape; treated as a single distribution.

    Returns
    -------
    np.ndarray
        Array of the same shape as x whose entries are non-negative and sum to 1.
        An empty input yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(x)
    x_exp = np.exp(x - np.max(x))
    return x_exp/np.sum(x_exp)

In [12]:
# Model / training hyperparameters
weight_std = 0.1  # standard deviation used for weight initialisation
H_size = 100  # number of hidden units
T_steps = 25  # length of one training sequence (time steps)
z_size = H_size + X_size  # size of the concatenated [hidden state; input] vector

In [13]:
# One individual LSTM parameter (a weight matrix or a bias vector)
class Param:
    """Bundle a parameter array with its name and a same-shaped gradient buffer."""

    def __init__(self, name, value):
        self.name = name  # parameter name
        self.value = value  # current parameter value
        # Gradient buffer, zero-initialised with the shape/dtype of `value`.
        self.dvalue = np.zeros_like(value)

In [14]:
class Parameters:
    """All trainable parameters of the LSTM cell and its output layer.

    Weights feeding a sigmoid gate (f, i, o) are drawn from a normal
    distribution with mean 0.5 and std `weight_std`; weights feeding tanh
    (candidate cell state) and the output layer use mean 0. Biases start at 0.
    """

    def __init__(self):
        gate_shape = (H_size, z_size)
        bias_shape = (H_size, 1)

        # Forget gate
        self.W_f = Param('W_f', 0.5 + weight_std*np.random.randn(*gate_shape))
        self.b_f = Param('b_f', np.zeros(bias_shape))

        # Input gate
        self.W_i = Param('W_i', 0.5 + weight_std*np.random.randn(*gate_shape))
        self.b_i = Param('b_i', np.zeros(bias_shape))

        # Candidate cell state
        self.W_C = Param('W_C', weight_std*np.random.randn(*gate_shape))
        self.b_C = Param('b_C', np.zeros(bias_shape))

        # Output gate
        self.W_o = Param('W_o', 0.5 + weight_std*np.random.randn(*gate_shape))
        self.b_o = Param('b_o', np.zeros(bias_shape))

        # Output (projection to vocabulary) layer
        self.W_v = Param('W_v', weight_std*np.random.randn(X_size, H_size))
        self.b_v = Param('b_v', np.zeros((X_size, 1)))

    def all(self):
        """Return every Param in a fixed order: f, i, C, o, v (weight then bias)."""
        return [
            self.W_f, self.b_f,
            self.W_i, self.b_i,
            self.W_C, self.b_C,
            self.W_o, self.b_o,
            self.W_v, self.b_v,
        ]
        

In [15]:
# Single shared parameter set. forward() and backward() bind this object as the
# default value of their `p` argument, so it must exist before they are defined.
parameters = Parameters()

In [16]:
def forward(x, h_prev, C_prev, p=parameters):
    """Run one LSTM time step followed by the softmax output layer.

    Parameters
    ----------
    x : np.ndarray, shape (X_size, 1)
        Input vector for this step.
    h_prev : np.ndarray, shape (H_size, 1)
        Hidden state from the previous step.
    C_prev : np.ndarray, shape (H_size, 1)
        Cell state from the previous step.
    p : Parameters
        Parameter set to use; defaults to the module-level `parameters`.

    Returns
    -------
    tuple
        (z, f, i, C_bar, C, o, h, v, y_bar) where z is h_prev stacked on top of
        x, f/i/o are the forget/input/output gate activations, C_bar is the
        candidate cell state, C and h are the new cell and hidden states, v are
        the output logits and y_bar = softmax(v).

    Raises
    ------
    AssertionError
        If any input does not have the expected column-vector shape.
    """
    assert x.shape == (X_size, 1)
    assert h_prev.shape == (H_size, 1)
    assert C_prev.shape == (H_size, 1)

    # np.vstack instead of np.row_stack: the latter is a deprecated alias.
    z = np.vstack((h_prev, x))
    f = sigmoid(np.dot(p.W_f.value, z)+p.b_f.value)
    i = sigmoid(np.dot(p.W_i.value, z)+p.b_i.value)
    C_bar = tanh(np.dot(p.W_C.value, z)+p.b_C.value)

    # New cell state: keep part of the old state, add part of the candidate.
    C = f*C_prev + i*C_bar
    o = sigmoid(np.dot(p.W_o.value, z)+p.b_o.value)
    h = o*tanh(C)

    # Output layer: logits and predicted distribution.
    v = np.dot(p.W_v.value, h)+p.b_v.value
    y_bar = softmax(v)

    return z, f, i, C_bar, C, o, h, v, y_bar

In [17]:
# First training window: inputs are characters 0..T_steps-1 as indices,
# targets are the same window shifted one character ahead.
input_data = [char_to_idx[c] for c in data[:T_steps]]
target_data = [char_to_idx[c] for c in data[1:T_steps + 1]]

In [18]:
# Offset of time step t inside the state buffers below: slot 0 is left as the
# all-zero initial state, so step t is stored at index t + reduce_data_len.
reduce_data_len = 1
# Hidden/cell states and their backward signals: one slot per step plus the initial slot.
h_data = np.zeros((T_steps+reduce_data_len, H_size, 1))
C_data = np.zeros((T_steps+reduce_data_len, H_size, 1))
theta_h_data = np.zeros((T_steps+reduce_data_len, H_size, 1))
theta_C_data = np.zeros((T_steps+reduce_data_len, H_size, 1))
# Per-step intermediates written by the training loop's forward pass (indexed by t directly).
z_data = np.zeros((T_steps, z_size, 1))
f_data = np.zeros((T_steps, H_size, 1))
i_data = np.zeros((T_steps, H_size, 1))
C_bar_data = np.zeros((T_steps, H_size, 1))
o_data = np.zeros((T_steps, H_size, 1))
v_data = np.zeros((T_steps, X_size, 1))
y_bar_data = np.zeros((T_steps, X_size, 1))

In [19]:
def backward(theta_h_data, theta_C_data, target_data, h_data, C_data, z_data, f_data, i_data, C_bar_data, o_data, v_data, y_bar_data, p=parameters, lr=0.0001):
    """Backpropagate through time over one T_steps window and apply one SGD step.

    The loss is the summed cross-entropy between y_bar_data[t] and the one-hot
    target at every step. Gradients are accumulated in each Param's `dvalue`
    and then applied in place as `value -= lr * dvalue` (gradient DESCENT).

    Parameters
    ----------
    theta_h_data, theta_C_data : np.ndarray, shape (T_steps + reduce_data_len, H, 1)
        Output buffers. On return, slot reduce_data_len + t holds dLoss/dh_t and
        dLoss/dC_t respectively (including the contribution from later steps).
    target_data : sequence of int
        Target class index for every time step.
    h_data, C_data : np.ndarray, shape (T_steps + reduce_data_len, H, 1)
        Hidden and cell states from the forward pass; slot reduce_data_len + t
        is step t, slot reduce_data_len + t - 1 is the state that fed step t.
    z_data, f_data, i_data, C_bar_data, o_data, y_bar_data : np.ndarray
        Per-step forward intermediates, indexed by t.
    v_data : np.ndarray
        Unused; kept so existing call sites keep working.
    p : Parameters
        Parameter set to update; defaults to the module-level `parameters`.
    lr : float
        Learning rate of the SGD step.

    Returns
    -------
    None
        Parameters in `p` and the two theta buffers are modified in place.
    """
    # Start from clean accumulators so repeated calls never mix gradients.
    for param in p.all():
        param.dvalue.fill(0)

    n_hidden = h_data.shape[1]
    # Gradient arriving from step t+1; zero beyond the end of the window.
    dh_next = np.zeros_like(h_data[0])
    dC_next = np.zeros_like(C_data[0])

    for t in range(T_steps-1, -1, -1):
        s = reduce_data_len+t  # slot of step t in the state buffers

        y_one_hot_t = np.zeros((X_size, 1))
        y_one_hot_t[target_data[t]] = 1
        # d(cross-entropy)/d(logits) for a softmax output is prediction - target.
        dv = y_bar_data[t]-y_one_hot_t

        p.W_v.dvalue += np.dot(dv, np.transpose(h_data[s]))
        p.b_v.dvalue += dv

        # h_t feeds both the output layer and the gates of step t+1.
        dh = np.dot(np.transpose(p.W_v.value), dv)+dh_next
        tanh_C = np.tanh(C_data[s])
        # C_t feeds h_t (through o * tanh) and C_{t+1} (through the forget gate).
        dC = dh*o_data[t]*(1-tanh_C*tanh_C)+dC_next

        # Gradients w.r.t. the pre-activations of the four gate/candidate units.
        do_pre = dh*tanh_C*o_data[t]*(1-o_data[t])
        # dC/df is the previous cell state itself (C = f*C_prev + i*C_bar).
        df_pre = dC*C_data[s-1]*f_data[t]*(1-f_data[t])
        di_pre = dC*C_bar_data[t]*i_data[t]*(1-i_data[t])
        # C_bar is already a tanh output, so its derivative is 1 - C_bar^2.
        dC_bar_pre = dC*i_data[t]*(1-C_bar_data[t]*C_bar_data[t])

        z_row = np.transpose(z_data[t])
        p.W_o.dvalue += np.dot(do_pre, z_row)
        p.b_o.dvalue += do_pre
        p.W_f.dvalue += np.dot(df_pre, z_row)
        p.b_f.dvalue += df_pre
        p.W_C.dvalue += np.dot(dC_bar_pre, z_row)
        p.b_C.dvalue += dC_bar_pre
        p.W_i.dvalue += np.dot(di_pre, z_row)
        p.b_i.dvalue += di_pre

        # Push the gradient back into z = [h_{t-1}; x_t]; the first n_hidden
        # rows belong to h_{t-1}.
        dz = (np.dot(np.transpose(p.W_f.value), df_pre)
              + np.dot(np.transpose(p.W_i.value), di_pre)
              + np.dot(np.transpose(p.W_C.value), dC_bar_pre)
              + np.dot(np.transpose(p.W_o.value), do_pre))
        dh_next = dz[:n_hidden]
        dC_next = f_data[t]*dC

        theta_h_data[s] = dh
        theta_C_data[s] = dC

    # Apply all gradients only after the whole window has been processed, so
    # every step's gradient is computed with the same parameter values.
    for param in p.all():
        param.value -= lr*param.dvalue

In [20]:
for epoch in range(20):
    # Forward pass over the window, caching every intermediate for backward().
    loss = 0
    for t in range(T_steps):
        x_t = input_data[t]
        y_t = target_data[t]
        x_one_hot_t = np.zeros((X_size, 1))
        y_one_hot_t = np.zeros((X_size, 1))
        x_one_hot_t[x_t] = 1
        y_one_hot_t[y_t] = 1

        # State buffers are shifted by reduce_data_len: the previous state of
        # step t lives one slot before the slot step t is written to.
        z_t, f_t, i_t, C_bar_t, C_t, o_t, h_t, v_t, y_bar_t = forward(
            x_one_hot_t,
            h_data[t + reduce_data_len - 1],
            C_data[t + reduce_data_len - 1],
        )

        h_data[t + reduce_data_len] = h_t
        C_data[t + reduce_data_len] = C_t
        z_data[t] = z_t
        f_data[t] = f_t
        i_data[t] = i_t
        o_data[t] = o_t
        C_bar_data[t] = C_bar_t
        v_data[t] = v_t
        y_bar_data[t] = y_bar_t

        # Cross-entropy of this step, accumulated over the window.
        loss -= np.sum(y_one_hot_t*np.log(y_bar_data[t]))
    # Backward pass: updates `parameters` in place.
    backward(theta_h_data, theta_C_data, target_data, h_data, C_data, z_data, f_data, i_data, C_bar_data, o_data, v_data, y_bar_data, p=parameters)
print('loss:', loss)

('loss:', 103.29016184582763)
