In [1]:
""" 
attention模型
"""


' \nattention模型\n'

In [2]:
import sys
sys.path.append('..')
from common.layers import Softmax
from common.np import *


numpy


In [3]:
class AttentionWeight:
    """ 根据编码器的输出hs(所有时间序列的隐状态)，求出各个单词的权重a ,使用softmax使得权重都在0-1之间"""

    def __init__(self) -> None:
        self.params = []
        self.grads = []
        self.softmax = Softmax()
        self.cache = None  # 存储中间结果，即hs，和计算a需要用到的初始化矩阵hr

    # h:解码器当前时间步的隐状态：目标是计算h和hs中各个单词的相似度，即权重a,有a和hs加权和得到上下文向量c,加权和即可确定当前词h应该注意到hs中的哪些词
    def forward(self, hs, h):
        N, T, H = hs.shape
        hr = h.reshape(N, 1, H).repeat(T, axis=1)
        t = hs*hr
        s = np.sum(t, axis=2)  # N,T 得到每个时间步的权重,需要正规化
        a = self.softmax.forward(s)

        self.cache = [hs, hr]
        return a  # 得到了权重a (N,T) 针对encoder中的每一个时间步的权重

    def backward(self, da):
        hs, hr = self.cache
        N, T, H = hs.shape

        # 反向传播，反着求即可
        ds = self.softmax.backward(da)
        dt = ds.reshape(N, T, 1).repeat(H, axis=2)
        dhs = dt*hr
        dhr = dt*hs
        dh = np.sum(dhr, axis=1)
        return dhs, dh


In [4]:
class WeightSum:
    """ 计算权重a和hs的加权和 
    得到的是一个时间步的上下文向量c:N,1,H
    """

    def __init__(self) -> None:
        self.params = []
        self.grads = []
        self.cache = None

    def forward(self, hs, a):
        # 计算加权和最好方式就是矩阵dot
        N, T, H = hs.shape
        ar = a.reshape(N, T, 1).repeat(H, axis=2)
        t = hs*ar
        c = np.sum(t, axis=1)  # t.shape:[N,H]
        self.cache = (hs, ar)

        return c

    def backward(self, dc):
        hs, ar = self.cache
        N, T, H = hs.shape

        dt = dc.reshape(N, 1, H).repeat(T, axis=1)
        dar = dt*hs
        dhs = dt*ar
        da = np.sum(da, axis=2)

        return dhs, da


In [5]:
class Attention:
    """ 整合了上两个模块 
    单个时间步的attention
    """

    def __init__(self) -> None:
        self.params, self.grads = [], []
        self.attention_weight_layer = AttentionWeight()
        self.weightsum_layer = WeightSum()
        self.attention_weight = None  # 保存当前的注意力权重

    def forward(self, hs, h):
        a = self.attention_weight_layer.forward(hs, h)
        c = self.weightsum_layer.forward(hs, a)
        self.attention_weight = a
        return c

    def backward(self, dc):
        dhs0, da = self.weightsum_layer.backward(dc)  # 该层正向传播是有a,hs得到c,故反向传播是由dc得到dhs,da
        dhs1, dh = self.attention_weight_layer.backward(da)

        dhs = dhs0+dhs1
        return dhs, dh


In [6]:
class TimeAttention:
    """ 只是组合了多个attention层 
        每个attention各自单独进行反向传播和正向传播
    """

    def __init__(self) -> None:
        self.params, self.grads = [], []
        self.layers = None  # 保存每一个时间步的上下文c
        self.attention_weight = None  # 保存每一个时间步的权重a

    def forward(self, hs_enc, hs_dec):
        """ 
        encoder的hs送到每一个attention 
        对于解码器每一个时间步都生成c
        """
        N, T, H = hs_dec.shape
        out_c = np.empty_like(hs_dec)
        self.layers = []
        self.attention_weight = []
        for t in range(T):
            layer = Attention()
            out_c[:, t, :] = layer.forward(
                hs_enc, hs_dec[:, t, :])  # 得到当前时间步的c
            self.layers.append(layer)
            self.attention_weight.append(layer.attention_weight)
        return out_c

    def backward(self, dout_c):
        # 正向传播是由hs_enc,hs_dec得到out_c,故反向传播是由dout_c得到dhs_enc,dhs_dec
        N, T, H = dout_c.shape
        dhs_enc = 0  # 因为解码器所有层用的都是同一个dhs_enc,故最终的dhs_enc是多有的dhs_enc之和
        dhs_dec = np.empty_like(dout_c)

        for t in range(T):
            layer = self.layers[t]
            dhs, dh = layer.backward(dout_c)
            dhs_enc += dhs
            dhs_dec[:, t, :] = dh
        return dhs_enc, dhs_dec


In [7]:

from common.time_layers import *


In [8]:
class Encoder:
    def __init__(self, vocab_size, wordec_size, hidden_size) -> None:
        V, D, H = vocab_size, wordec_size, hidden_size

        # 初始化参数
        embed_w = (np.random.randn(V, D)/100).astype('f')
        lstm_Wx = (np.random.randn(D, 4*H)/np.sqrt(D)).astype('f')
        lstm_Wh = (np.random.randn(H, 4*H)/np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4*H).astype('f')

        # 编码器的几个层
        self.embed = TimeEmbedding(embed_w)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        # 参数
        self.params = self.embed.params+self.lstm.params
        self.grads = self.embed.grads+self.lstm.grads

        self.hs = None  # 每个时间步的隐状态

    def forward(self, xs):
        xs = self.embed.forward(xs)  # 取出某个序列的词向量
        hs = self.lstm.forward(xs)
        self.hs = hs
        return self.hs[:, -1, :]  # 返回最后一个时间步的隐藏状态

    def backward(self, dh):
        dhs = np.zeros_like(self.hs)
        # 初始状态由解码器传来
        dhs[:, -1, :] = dh
        dout = self.lstm.backward(dhs)
        out = self.embed.backward(dout)
        return out


In [9]:
class Decoder:
    def __init__(self, vocab_size, wordec_size, hidden_size) -> None:
        V, D, H = vocab_size, wordec_size, hidden_size
        # decoder也要学习：故先初始化参数
        embed_w = (np.random.randn(V, D)/100).astype('f')
        lstm_Wx = (np.random.randn(D, 4*H)/np.sqrt(D)).astype('f')
        lstm_Wh = (np.random.randn(H, 4*H)/np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4*H).astype('f')
        affine_w = (np.random.randn(H, V)/np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_w)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)
        self.affine = TimeAffine(affine_w, affine_b)

        self.params = self.embed.params+self.lstm.params+self.affine.params
        self.grads = self.embed.params+self.lstm.params+self.affine.params

    def forward(self, xs, h):
        self.lstm.set_state(h)
        out = self.embed.forward(xs)
        out = self.lstm.forward(out)
        out = self.affine.forward(out)
        return out

    def backward(self, dscore):
        dout = self.affine.backward(dscore)
        dout = self.lstm.backward(dout)
        dout = self.embed.backward(dout)
        dh = self.lstm.dh
        return dh

    def generate(self, h, start_id, sample_size):
        self.lstm.set_state(h)
        sample = []
        sample_id = start_id  # 当前预测时的输入，把前一个预测结果当作下一次预测的输出
        for _ in range(sample_size):  # 训练数据获得输出
            x = np.array(sample_id).reshape(1, 1)
            out = self.embed.forward(x)
            out = self.lstm.forward(out)
            y = self.affine.forward(out)

            sample_id = np.argmax(y.flatten())
            sample.append(int(sample_id))
        return sample


In [10]:
""" 带attention的seq2seq实现 
与之前encoder不同之处仅在于返回的是全部的隐藏状态hs
decoder不同之处在于添加了attention层：添加的位置可由自己决定
"""


class AttentionEncoder(Encoder):
    def forward(self, xs):
        xs = self.embed.forward(xs)
        hs = self.lstm.forward(xs)
        return hs

    def backward(self, dhs):
        dout = self.lstm.backward(dhs)
        dxs = self.embed.backward(dout)
        return dxs


In [11]:
class AttentionDecoder:  # 不能继承，因为它在中间添加了一个新的层
    def __init__(self, vocab_size, wordec_size, hidden_size) -> None:
        V, D, H = vocab_size, wordec_size, hidden_size
        embed_w = (np.random.randn(V, D)/100).astype('f')
        lstm_Wx = (np.random.randn(D, 4*H)/np.sqrt(D)).astype('f')
        lstm_Wh = (np.random.randn(H, 4*H)/np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4*H).astype('f')
        affine_w = (np.random.randn(2*H, V)/np.sqrt(2*H)).astype('f')
        affine_b = np.zeros(V).astype('f')
        # 初始化参数

        self.embed = TimeEmbedding(embed_w)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        # 添加attention层
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_w, affine_b)

        layers = [self.embed, self.lstm, self.attention, self.affine]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, hs_enc):
        # lstm只需要编码器的最后一个隐藏状态
        h = hs_enc[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        hs_dec = self.lstm.forward(out)
        c = self.attention.forward(hs_enc, hs_dec)
        # 隐藏向量和上下文向量拼接后一起作为affine的输入
        affine_in = np.concatenate(hs_dec, c, axis=2)  # N,T,H,应该在h维拼接
        score = self.affine.forward(affine_in)
        return score

    def backward(self, dscore):
        daffine_in = self.affine.backward(dscore)
        N, T, H2 = daffine_in.shape
        H  = H2//2 #注意H2是拼接之后的
        dc = daffine_in[:, :, :H]  # 左开右闭
        dhs_dec_0 = daffine_in[:, :, H:]
        dhs_enc, dhs_dec_1 = self.attention.backward(dc)
        dhs_dec = dhs_dec_0+dhs_dec_1
        dout = self.lstm.backward(dhs_dec)
        dxs = self.embed.backward(dout)
        return dhs_enc
    def generate(self,enc_hs,start_id,sample_size):
        self.lstm.set_state(enc_hs)
        sample = []
        sample_id = start_id
        for _ in range(sample_size):
            #取得开始单词的词向量，训练，然后生成
            x = np.array(sample_id).reshape(1,1)
            out = self.embed.forward(x)
            out = self.lstm.forward(out)
            out = self.attention.forward(out)
            score = self.affine.forward(out)

            y = np.argmax(score.flatten())
            sample_id = y #当前的生成做为下一个的输入
            sample.append(int(sample_id))
        return sample

In [12]:
class AttentionSeq2Seq:
    def __init__(self,vocab_size, wordec_size, hidden_size) -> None:
        self.encoder = AttentionEncoder(vocab_size, wordec_size, hidden_size)
        self.decoder = AttentionDecoder(vocab_size, wordec_size, hidden_size)
        #损失函数，计算损失
        self.loss = TimeSoftmaxWithLoss()

        self.params,self.grads = [],[]

        self.params = self.encoder.params+self.decoder.params
        self.grads = self.encoder.grads+self.decoder.params
    def forward(self,xs,ts):
        decoder_xs,decoder_ts = ts[:,:-1],ts[:,1:] #训练时包括所有，判断正误不用_
        enc_hs = self.encoder.forward(xs)
        out = self.decoder.forward(decoder_xs,enc_hs)
        loss = self.loss.forward(out,decoder_ts)
        return loss
    def backward(self,dout=1):
        dout = self.loss.backward(dout)
        denc_hs = self.decoder.backward(dout)
        dxs = self.encoder.backward(denc_hs)
        return dxs
    def generate(self,xs,start_id,sample_size):
        enc_hs = self.encoder.forward(xs)
        sample = self.decoder.generate(enc_hs,start_id,sample_size)
        return sample

In [13]:
import time
from dataset import sequence
from common.optimizer import Adam
from common.util import clip_grads
import matplotlib.pyplot as plt
# from common.trainer import Trainer

In [14]:
class Trainer:
    def __init__(self,model,optimizer) -> None:
        self.model = model
        self.optimizer = optimizer
        self.loss = []
        self.current_epoch=0
        self.eval_interval = None
    def fit(self,train_data,table_data,batch_size,max_epochs,max_grad,eval_interval=20):#迭代多少组时打印当前情况
        data_size = len(train_data)
        max_iters = data_size//batch_size #数据的批次大小
        loss_sum = 0
        loss_count = 0

        start = time.time()
        for epoch in range(max_epochs):
            #打乱数据，然后计算每一个批次的数据
            idx = np.random.permutation(np.arange(data_size))
            x = train_data[idx]
            t = table_data[idx]#已经打乱的数据

            for iters in range(max_iters):
                #取出一个批次
                batch_x = x[batch_size*iters:batch_size*(iters+1)]
                batch_t = t[batch_size*iters:batch_size*(iters+1)]

                #前向传播计算损失、后向传播求梯度更新参数
                loss = self.model.forward(batch_x,batch_t) #model的前向传播结束时已经计算出了损失
                self.model.backward()
                params,grads = self.model.params,self.model.grads

                if max_grad is not None:#梯度裁剪
                    clip_grads(grads,max_grad)
                self.optimizer.update(params,grads)
                loss_sum+=loss
                loss_count+=1
            
                #评价
                if eval_interval is not None and (iters%eval_interval==0):
                    avg_loss = loss_sum/loss_count
                    cost_time = time.time()-start
                    print('| epoch %d| | iters %d/%d | | time %d | | avg_loss %d |'%(epoch,iters,max_iters,cost_time,avg_loss))
                    self.loss.append(float(avg_loss))
                    loss_count,loss_sum=0,0 #一定模块打断，也就是将数据分成了几个单独的模块计算
            self.current_epoch+=1
        def train_plot(self,ylim=None):
            x = np.arange(len(self.loss)) #x轴，self.loss为y轴
            if ylim is not None:
                plt.ylim(ylim)
            plt.plot(x,self.loss,label="train_data")
            plt.xlabel('iterations (x'+str(self.eval_interval)+')')
            plt.show()

In [15]:
(x_train,t_train),(x_test,t_test) = sequence.load_data('date.txt')
char_to_id,id_to_char = sequence.get_vocab()
x_train,x_test = x_train[:,::-1],x_test[:,::-1]#反转
print(len(x_test))

5000


In [16]:
#设定超参数
vocab_size=len(char_to_id)#词表大小
wordec_size = 16
hidden_size = 256
batch_size,epochs = 128,10
max_grad = 5.0 #梯度裁剪

#初始化模型
model = AttentionSeq2Seq(vocab_size, wordec_size, hidden_size)
model.forward(x_train,t_train)
optimizer = Adam()
trainer = Trainer(model,optimizer)

KeyboardInterrupt: 