# Assignment
目前我们可以有
## RNN Class
RNN 很容易实现：它接受一个向量 $x$ 作为输入，并返回一个向量 $y$。不过，输出的内容不仅与当前的输入有关，还与过去的输入相关。因此我们可以定义一个 RNN 类，通过调用其 `step` 方法来完成一次迭代：

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from IPython.core.interactiveshell import InteractiveShell
from sklearn.preprocessing import OneHotEncoder
InteractiveShell.ast_node_interactivity = "all"

每调用一次 `step`，状态向量 $h$ 就会被更新一次。请同学们根据课上所讲内容，完成 RNN 的定义，并构建一个多层 RNN。

In [3]:
class RNN:
    """Minimal vanilla RNN cell implemented with NumPy (forward pass only).

    The weights are drawn once at construction time and reused by every call
    to ``step``, so all time steps share the same parameters. Hidden states
    are stored in ``self.hs`` keyed by time-step index; ``hs[-1]`` is the
    all-zero initial state.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Store the layer sizes and initialise parameters and state.

        Args:
            input_dim: size of each input column vector.
            hidden_dim: size of the hidden state vector.
            output_dim: size of each output column vector.
        """
        self.input_dim = input_dim  # input layer size
        self.hidden_dim = hidden_dim  # hidden layer size
        self.output_dim = output_dim  # output layer size

        # Parameters are created here, not in step(): re-sampling them on
        # every call would give each time step unrelated random weights.
        self.Whx = np.random.randn(hidden_dim, input_dim)  # input to hidden
        self.Whh = np.random.randn(hidden_dim, hidden_dim)  # hidden to hidden
        self.Why = np.random.randn(output_dim, hidden_dim)  # hidden to output
        self.bh = np.zeros((hidden_dim, 1))  # hidden bias
        self.by = np.zeros((output_dim, 1))  # output bias

        self.hprev = np.zeros((hidden_dim, 1))
        self.hs = {-1: self.hprev}  # initial hidden state

    def print_RNN(self):
        """Print the input, hidden and output layer sizes."""
        print("输入层维度：{0}，隐层维度：{1}，输出层维度：{2}\n".format(self.input_dim,self.hidden_dim,self.output_dim))

    def step(self, t, x):
        """Advance the network by one time step.

        Args:
            t: index of the current time step; ``self.hs[t - 1]`` must exist.
            x: input column vector of shape ``(input_dim, 1)``.

        Returns:
            A tuple ``(y, hs)`` where ``y`` is the output column vector of
            shape ``(output_dim, 1)`` and ``hs`` is the dict holding every
            hidden state computed so far (the same object as ``self.hs``).

        Raises:
            KeyError: if no hidden state is stored for step ``t - 1``.
        """
        # update the hidden state
        h_prev = self.hs[t - 1]
        self.hs[t] = np.tanh(np.dot(self.Whh, h_prev) + self.bh + np.dot(self.Whx, x))
        # compute the output vector
        y = np.dot(self.Why, self.hs[t]) + self.by
        return y, self.hs

In [4]:
# Going deep
# Following the "Character-Level Language Models" example from the lecture:
# feed the one-hot encodings of ["h","e","l","l","o"] (5 time steps) through
# the NumPy RNN and inspect the output of every step.

inputs = np.array([['h'],['e'],['l'],['l'],['o']])
# Use the default (sparse) encoder and densify with .toarray(): this works on
# every scikit-learn version, whereas the `sparse=False` keyword was
# deprecated in 1.2 and scheduled for removal in 1.4.
cat_encoder = OneHotEncoder()
inputs_1hot = cat_encoder.fit_transform(inputs).toarray()
print("输入层one_hot编码：{0}".format(inputs_1hot))
# Turn every one-hot row into a column vector; -1 lets NumPy infer the
# vocabulary size instead of hard-coding it.
inputs_1hot = [row.reshape(-1, 1) for row in inputs_1hot]
input_dim = inputs_1hot[0].shape[0]  # input layer size
out_dim = input_dim  # output layer size
hidden_dim = 3  # hidden layer size
rnn = RNN(input_dim,hidden_dim,out_dim)
rnn.print_RNN()

# xt is an input vector, y is the RNN's output vector
for t,xt in enumerate(inputs_1hot):
    y,hs = rnn.step(t,xt)
    print("第{0}层输出拉平后:{1}\n,hs:{2}\n".format(t,y.ravel(),hs))
    print('****************************************************')


输入层one_hot编码：[[0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
输入层维度：4，隐层维度：3，输出层维度：4

第0层输出拉平后:[ 1.13392716  0.3900951  -0.03377654 -0.56116103]
,hs:{-1: array([[0.],
       [0.],
       [0.]]), 0: array([[ 0.97468086],
       [-0.53117275],
       [-0.80720381]])}

****************************************************
第1层输出拉平后:[ 1.36769746  1.47565185  1.25032552 -0.76471354]
,hs:{-1: array([[0.],
       [0.],
       [0.]]), 0: array([[ 0.97468086],
       [-0.53117275],
       [-0.80720381]]), 1: array([[-0.87020032],
       [-0.28302146],
       [-0.92763278]])}

****************************************************
第2层输出拉平后:[ 0.29863054  3.01268666 -0.88876414 -3.99118555]
,hs:{-1: array([[0.],
       [0.],
       [0.]]), 0: array([[ 0.97468086],
       [-0.53117275],
       [-0.80720381]]), 1: array([[-0.87020032],
       [-0.28302146],
       [-0.92763278]]), 2: array([[ 0.90682082],
       [ 0.77305373],
       [-0.97965883]])}

*************************

## 结合课堂代码，自己实现一个character-level 的RNN model

In [23]:
class RnnModel(nn.Module):
    """Character-level model: an ``nn.RNN`` followed by a linear read-out.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the recurrent hidden state.
        output_size: number of output features produced per time step.
        layer_num: number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, output_size, layer_num):
        super().__init__()
        # batch_first=True -> inputs/outputs are laid out (batch, seq, feature)
        self.rnnLayer = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=layer_num,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return per-time-step outputs of shape ``(batch, seq, output_size)``."""
        hidden_seq, _last_hidden = self.rnnLayer(x)
        # nn.Linear acts on the last axis, so batch and seq axes are preserved
        return self.fc(hidden_seq)


In [24]:
# Custom dataset with two samples, "hello" and "elloh". Each target is the
# input rotated left by one character, i.e. "elloh" and "llohe".
inputs_array = np.array([[['h'],['e'],['l'],['l'],['o']],
                        [['e'],['l'],['l'],['o'],['h']]])
# Fit ONE encoder on every character so that all samples share the same
# character-to-index mapping (a per-sample encoder only agrees by coincidence
# when each sample happens to contain the full alphabet).
# The default sparse output + .toarray() avoids the `sparse=False` keyword,
# which was deprecated in scikit-learn 1.2 and scheduled for removal in 1.4.
cat_encoder = OneHotEncoder()
cat_encoder.fit(inputs_array.reshape(-1, 1))
target_order = [1,2,3,4,0]  # index of the "next" character, wrapping around
inputs_list = []  # one-hot encoded inputs, one array per sample
target_list = []  # class-index targets, one tensor per sample
for x in inputs_array:
    inputs_1hot = cat_encoder.transform(x).toarray()
    inputs_list.append(inputs_1hot)
    target_1hot = torch.FloatTensor(inputs_1hot[target_order])
    # position of the 1 in each one-hot row == class index
    _,target = torch.max(target_1hot, 1)
    target_list.append(target)
print("输入层one_hot编码：{0}".format(inputs_list))
print("标签：{0}".format(target_list))


输入层one_hot编码：[array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]]), array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])]
标签：[tensor([0, 2, 2, 3, 1]), tensor([2, 2, 3, 1, 0])]


In [27]:
# Hyper-parameters for the character-level RNN experiment
batch_size = 1
seq_len = 5  # number of time steps per sequence
input_size = 4  # input feature size
hidden_size = 3  # size of the hidden state h
output_size = 4  # output feature size
layer_num = 1  # number of stacked RNN layers

# Loss, model and optimiser; the model is built from the named values above
criterion = nn.CrossEntropyLoss()
rnn = RnnModel(input_size, hidden_size, output_size, layer_num)
optimizer = optim.Adam(rnn.parameters(), lr=0.1)
rnn

RnnModel(
  (rnnLayer): RNN(4, 3, batch_first=True)
  (fc): Linear(in_features=3, out_features=4, bias=True)
)

In [28]:
# Train the RNN for a fixed number of epochs, one sequence per optimiser step,
# printing predictions, targets and the mean loss/accuracy of every epoch.
epoch_num = 20
for epoch in range(epoch_num):
    print('Epoch {}/{}'.format(epoch+1, epoch_num))
    running_loss = 0.0
    running_acc =0.0
    for i,x_input in enumerate(inputs_list):
        optimizer.zero_grad()
        # Add a leading batch axis of size 1 -> (1, seq_len, input_size).
        # The shape is taken from the data rather than hard-coded.
        inputs = torch.FloatTensor(x_input).unsqueeze(0)
        target = target_list[i]
        output = rnn(inputs)
        # Collapse to (num_steps, num_classes) so it lines up with `target`
        output = output.reshape(-1, output.size(-1))
        _,preds = torch.max(output,1)
        running_acc += torch.sum(preds == target).double()/target.size()[0]
        loss = criterion(output,target)
        running_loss += loss.item()
        print("第{0}个样本预测值:{1}".format(i+1,preds))
        print("第{0}个样本标签值:{1}".format(i+1,target))
        loss.backward()
        optimizer.step()
    epoch_loss = running_loss/len(inputs_list)
    epoch_acc = running_acc/len(inputs_list)
    print("Loss:{:.4f},Acc:{:.4f}".format(epoch_loss,epoch_acc))
    print("***************************************")
    
   

Epoch 1/20
第1个样本预测值:tensor([2, 2, 1, 1, 2])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 2, 2, 2])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:1.3762,Acc:0.3000
***************************************
Epoch 2/20
第1个样本预测值:tensor([2, 2, 2, 2, 2])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 2, 2, 2])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:1.2341,Acc:0.4000
***************************************
Epoch 3/20
第1个样本预测值:tensor([0, 2, 2, 2, 2])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 2, 2, 0])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:1.0592,Acc:0.6000
***************************************
Epoch 4/20
第1个样本预测值:tensor([0, 2, 2, 2, 1])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 2, 1, 0])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:0.8499,Acc:0.8000
***************************************
Epoch 5/20
第1个样本预测值:tensor([0, 2, 2, 2, 1])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 2, 1, 0])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:0.6241,Acc:0.8000
**********

## LSTM Class （Optional）
自定义一个LSTM网络并进行训练， 对比simple RNN的效果

In [14]:
class Lstm(nn.Module):
    """Character-level model: an ``nn.LSTM`` followed by a linear read-out.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        output_size: number of output features produced per time step.
        layer_num: number of stacked LSTM layers.
    """

    def __init__(self,input_size,hidden_size,output_size,layer_num):
        super().__init__()
        # batch_first=True -> inputs/outputs are laid out (batch, seq, feature)
        self.lstmlayer = nn.LSTM(input_size,hidden_size,layer_num,batch_first=True)
        self.fc = nn.Linear(hidden_size,output_size)

    def forward(self,x):
        """Return outputs flattened to ``(batch * seq, output_size)``.

        Rows are ordered batch-major: all time steps of sample 0 come first,
        then all time steps of sample 1, and so on.
        """
        out,_ = self.lstmlayer(x)
        batch,seq,hidden = out.size()
        # reshape() instead of view(): with batch_first=True the LSTM output
        # may be a non-contiguous tensor when batch > 1, in which case view()
        # raises a RuntimeError; reshape() copies only when it has to.
        out = out.reshape(batch*seq,hidden)
        out = self.fc(out)
        return out
    
    

In [10]:
# Custom dataset with two samples, "hello" and "elloh". Each target is the
# input rotated left by one character, i.e. "elloh" and "llohe".
inputs_array = np.array([[['h'],['e'],['l'],['l'],['o']],
                        [['e'],['l'],['l'],['o'],['h']]])
# Fit ONE encoder on every character so that all samples share the same
# character-to-index mapping (a per-sample encoder only agrees by coincidence
# when each sample happens to contain the full alphabet).
# The default sparse output + .toarray() avoids the `sparse=False` keyword,
# which was deprecated in scikit-learn 1.2 and scheduled for removal in 1.4.
cat_encoder = OneHotEncoder()
cat_encoder.fit(inputs_array.reshape(-1, 1))
target_order = [1,2,3,4,0]  # index of the "next" character, wrapping around
inputs_list = []  # one-hot encoded inputs, one array per sample
target_list = []  # class-index targets, one tensor per sample
for x in inputs_array:
    inputs_1hot = cat_encoder.transform(x).toarray()
    inputs_list.append(inputs_1hot)
    target_1hot = torch.FloatTensor(inputs_1hot[target_order])
    # position of the 1 in each one-hot row == class index
    _,target = torch.max(target_1hot, 1)
    target_list.append(target)
print("输入层one_hot编码：{0}".format(inputs_list))
print("标签：{0}".format(target_list))

输入层one_hot编码：[array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]]), array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])]
标签：[tensor([0, 2, 2, 3, 1]), tensor([2, 2, 3, 1, 0])]


In [29]:
# Hyper-parameters for the character-level LSTM experiment
batch_size = 1
seq_len = 5  # number of time steps per sequence
input_size = 4  # input feature size
hidden_size = 3  # size of the hidden state h
output_size = 4  # output feature size
layer_num = 1  # number of stacked recurrent layers

# Loss, model and optimiser; the model is built from the named values above
criterion = nn.CrossEntropyLoss()
lstm = Lstm(input_size, hidden_size, output_size, layer_num)
optimizer = optim.Adam(lstm.parameters(), lr=0.1)
lstm

Lstm(
  (lstmlayer): LSTM(4, 3, batch_first=True)
  (fc): Linear(in_features=3, out_features=4, bias=True)
)

In [30]:
# Train the LSTM for a fixed number of epochs, one sequence per optimiser
# step, printing predictions, targets and the mean loss/accuracy per epoch.
epoch_num = 20
for epoch in range(epoch_num):
    print('Epoch {}/{}'.format(epoch+1, epoch_num))
    running_loss = 0.0
    running_acc =0.0
    for i,x_input in enumerate(inputs_list):
        optimizer.zero_grad()
        # Add a leading batch axis of size 1 -> (1, seq_len, input_size).
        # The shape is taken from the data rather than hard-coded.
        inputs = torch.FloatTensor(x_input).unsqueeze(0)
        target = target_list[i]
        output = lstm(inputs)
        # Collapse to (num_steps, num_classes) so it lines up with `target`
        output = output.reshape(-1, output.size(-1))
        _,preds = torch.max(output,1)
        running_acc += torch.sum(preds == target).double()/target.size()[0]
        loss = criterion(output,target)
        running_loss += loss.item()
        print("第{0}个样本预测值:{1}".format(i+1,preds))
        print("第{0}个样本标签值:{1}".format(i+1,target))
        loss.backward()
        optimizer.step()
    epoch_loss = running_loss/len(inputs_list)
    epoch_acc = running_acc/len(inputs_list)
    print("Loss:{:.4f},Acc:{:.4f}".format(epoch_loss,epoch_acc))
    print("***************************************")

Epoch 1/20
第1个样本预测值:tensor([0, 0, 0, 0, 0])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 2, 2, 0])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:1.3726,Acc:0.4000
***************************************
Epoch 2/20
第1个样本预测值:tensor([2, 2, 2, 2, 2])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 2, 2, 0])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:1.2297,Acc:0.5000
***************************************
Epoch 3/20
第1个样本预测值:tensor([2, 2, 2, 2, 2])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 2, 2, 0])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:1.0553,Acc:0.5000
***************************************
Epoch 4/20
第1个样本预测值:tensor([1, 2, 2, 2, 1])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 2, 1, 1])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:0.8720,Acc:0.6000
***************************************
Epoch 5/20
第1个样本预测值:tensor([1, 2, 2, 2, 1])
第1个样本标签值:tensor([0, 2, 2, 3, 1])
第2个样本预测值:tensor([2, 2, 3, 1, 1])
第2个样本标签值:tensor([2, 2, 3, 1, 0])
Loss:0.7381,Acc:0.7000
**********

将简单 LSTM 与简单 RNN 对比可以看到：RNN 的 loss 下降更快，最终的 loss 也更小，LSTM 的优势并没有体现出来。
原因应该是输入的字母序列太短，LSTM 擅长长期记忆的优势体现不出来。