<a href="https://colab.research.google.com/github/gusya-soc/notebook_collection/blob/main/transformer_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
import numpy as np

In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from tensorflow.keras import models,layers

In [None]:
# @tf.keras.utils.register_keras_serializable()
class MultiHeadAttention(layers.Layer):
    """Multi-head scaled dot-product attention.

    The query, key and value inputs are each projected by their own Dense
    layer to ``hidden_size`` features, split into ``head_num`` heads of
    ``hidden_size // head_num`` features, attended per head, merged back and
    passed through a final Dense layer.

    Args:
        head_num: number of attention heads.
        hidden_size: total feature width; must be divisible by ``head_num``.

    Raises:
        ValueError: if ``hidden_size`` is not a multiple of ``head_num``.
    """

    def __init__(self,head_num,hidden_size):
        super().__init__()
        # Without this check the reshape in splite_heads can silently succeed
        # while mixing features of different tokens into one head.
        if hidden_size % head_num != 0:
            raise ValueError(
                "hidden_size ({}) must be divisible by head_num ({})".format(
                    hidden_size, head_num))
        self.head_num = head_num
        self.hidden_size = hidden_size
        self.sub_hidden = hidden_size // head_num
        self.dense_q = layers.Dense(hidden_size)
        self.dense_k = layers.Dense(hidden_size)
        self.dense_v = layers.Dense(hidden_size)
        self.dense_output = layers.Dense(hidden_size)

    def scale_dot_product(self,q,k,v,mask):
        """Return softmax(q k^T / sqrt(d_k)) v.

        q, k, v are the per-head tensors produced by ``splite_heads``, i.e.
        (batch_size, head_num, seq_len, sub_hidden). ``mask`` may be None;
        otherwise positions where it equals 1 receive a -1e9 additive bias
        before the softmax, which drives their attention weight to ~0.
        """
        matmul_qk = tf.matmul(q,k,transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scale_qk = matmul_qk / tf.math.sqrt(dk)

        if mask is not None:
            scale_qk += (mask * -1e9)

        # Normalise over the key axis (the last one).
        attention_weight = tf.nn.softmax(scale_qk, axis=-1)

        return tf.matmul(attention_weight,v)

    def splite_heads(self,x):
        """Reshape (batch, seq, hidden_size) into (batch, head_num, seq, sub_hidden)."""
        # Batch size is kept local: storing a per-call tensor on the layer
        # made call() depend on a hidden side effect of this method.
        batch_size = tf.shape(x)[0]
        x = tf.reshape(x,(batch_size,-1,self.head_num,self.sub_hidden)) # (batch_size, seq, head_num, sub_hidden)
        # Swap the head and seq axes so each head's (seq, sub_hidden) matrix
        # is what the batched matmul operates on.
        return tf.transpose(x,perm=[0,2,1,3])

    def call(self,q,k,v,mask):
        """Apply attention of ``q`` over ``k``/``v``.

        Returns a tensor of shape (batch, q_seq_len, hidden_size).
        """
        # The output batch dimension follows the query input.
        batch_size = tf.shape(q)[0]

        q = self.dense_q(q)
        k = self.dense_k(k)
        v = self.dense_v(v)

        q = self.splite_heads(q)
        k = self.splite_heads(k)
        v = self.splite_heads(v)

        selfattention = self.scale_dot_product(q,k,v,mask=mask)
        # Back to (batch, seq, head_num, sub_hidden) before merging heads.
        selfattention = tf.transpose(selfattention,perm=[0,2,1,3])

        concated = tf.reshape(selfattention,(batch_size,-1,self.hidden_size))

        output = self.dense_output(concated)

        return output




In [None]:
# temp_mha = MultiHeadAttention(head_num=8,hidden_size=512)
# y = tf.random.uniform((1, 60, 512))
# out = temp_mha(y,y,y,mask=None)
# out.shape

In [None]:
# @tf.keras.utils.register_keras_serializable()
class Encoder(layers.Layer):
    """One encoder block: self-attention and a two-layer feed-forward net.

    Each sub-block is wrapped in a residual connection followed by layer
    normalisation. The feed-forward net expands to ``4 * hidden_size`` with
    ReLU and projects back to ``hidden_size``.
    """

    def __init__(self,head_num,hidden_size):
        super().__init__()
        self.head_num = head_num
        self.hidden_size = hidden_size
        self.MHA = MultiHeadAttention(head_num,hidden_size)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn_1 = layers.Dense(self.hidden_size*4,activation='relu')
        self.ffn_2 = layers.Dense(self.hidden_size)

    def call(self,x,mask):
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer."""
        # Self-attention sub-block (residual + norm).
        attended = self.MHA(q=x,k=x,v=x,mask=mask)
        x = self.norm_1(x + attended)

        # Feed-forward sub-block (residual + norm).
        projected = self.ffn_2(self.ffn_1(x))
        return self.norm_2(x + projected)

class Decoder(layers.Layer):
    """One decoder block: self-attention, encoder attention, feed-forward.

    Every sub-block adds its input back (residual) and applies its own layer
    normalisation. The feed-forward net expands to ``4 * hidden_size`` with
    ReLU and projects back to ``hidden_size``.
    """

    def __init__(self,head_num,hidden_size):
        super().__init__()
        self.head_num = head_num
        self.hidden_size = hidden_size
        self.MHA_1 = MultiHeadAttention(head_num,hidden_size)
        self.MHA_2 = MultiHeadAttention(head_num,hidden_size)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn_1 = layers.Dense(self.hidden_size*4,activation='relu')
        self.ffn_2 = layers.Dense(self.hidden_size)

    def call(self,x,enc_out,look_ahead_mask,padding_mask):
        """Run the block.

        ``look_ahead_mask`` is passed to the self-attention over ``x``;
        ``padding_mask`` is passed to the attention whose keys and values
        are ``enc_out``.
        """
        # Self-attention over the decoder input.
        self_attended = self.MHA_1(x,x,x,mask=look_ahead_mask)
        x = self.norm_1(tf.add(x,self_attended))

        # Attention with decoder queries and encoder keys/values.
        cross_attended = self.MHA_2(q=x,k=enc_out,v=enc_out,mask=padding_mask)
        x = self.norm_2(tf.add(x,cross_attended))

        # Position-wise feed-forward net.
        fed_forward = self.ffn_2(self.ffn_1(x))
        return self.norm_3(tf.add(x,fed_forward))

        

In [None]:
def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: number of feature columns.

    Returns:
        A float32 tensor of shape (1, position, d_model) with sin applied to
        the even columns and cos applied to the odd columns.
    """
    positions = np.arange(position)[:, np.newaxis]   # (position, 1)
    columns = np.arange(d_model)[np.newaxis, :]      # (1, d_model)

    # Columns 2i and 2i+1 share the rate 1 / 10000^(2i / d_model).
    exponents = (2 * (columns // 2)) / np.float32(d_model)
    table = positions * (1 / np.power(10000, exponents))

    # Even column indices (2i) get sin, odd column indices (2i+1) get cos.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Leading axis of size 1 so the table broadcasts over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [None]:
class Transformer(models.Model):
    """Encoder-decoder Transformer that outputs token probabilities.

    Args:
        head_num: attention heads per attention layer.
        hidden_size: embedding / model width.
        layer_num: number of encoder blocks and of decoder blocks.
        vocab_size: size of both embedding tables and of the output layer.
            Defaults to 9000, the value that was previously hard-coded.
        max_position: number of rows in the positional-encoding table, i.e.
            the longest sequence that can be encoded. Defaults to
            ``vocab_size``, which reproduces the previous behaviour.
    """

    def __init__(self,head_num,hidden_size,layer_num,vocab_size=9000,max_position=None):
        super().__init__()
        self.head_num = head_num
        self.hidden_size = hidden_size
        self.layer_num = layer_num
        self.vocab_size = vocab_size
        self.max_position = vocab_size if max_position is None else max_position

        self.enc_layers = [Encoder(head_num=head_num,hidden_size=hidden_size) for _ in range(layer_num)]
        self.dec_layers = [Decoder(head_num=head_num,hidden_size=hidden_size) for _ in range(layer_num)]

        self.inp_emb = layers.Embedding(self.vocab_size,hidden_size)
        self.out_emb = layers.Embedding(self.vocab_size,hidden_size)

        # Precomputed once; sliced to the actual sequence length in call().
        self.position = positional_encoding(self.max_position,hidden_size)

        # Softmax output, so losses must be configured with from_logits=False.
        self.dense = layers.Dense(self.vocab_size,activation='softmax')

    def call(self,inputs):
        """Run the full model.

        Args:
            inputs: a 5-element sequence
                ``(enc_inp, dec_inp, enc_padding_mask, dec_padding_mask,
                look_ahead_mask)``. The masks may be None.

        Returns:
            The softmax output of the final Dense layer applied to the
            decoder stack output; its last axis has ``vocab_size`` entries.
        """
        enc_inp,dec_inp,enc_padding_mask,dec_padding_mask,look_ahead_mask = inputs

        enc_seq_len = tf.shape(enc_inp)[1]
        dec_seq_len = tf.shape(dec_inp)[1]

        ## encoder layers
        x_e = self.inp_emb(enc_inp)
        x_e += self.position[:,:enc_seq_len,:]
        for enc_layer in self.enc_layers:
            x_e = enc_layer(x_e,enc_padding_mask)

        ## decoder layers
        x_d = self.out_emb(dec_inp)
        x_d += self.position[:,:dec_seq_len,:]
        for dec_layer in self.dec_layers:
            x_d = dec_layer(x_d,x_e,look_ahead_mask,dec_padding_mask)

        ## output layer
        output = self.dense(x_d)
        return output

In [None]:
# Smoke test: build a small 4-layer model and a dummy (64, 26) target batch.
# The matching forward pass lives in the commented-out cells that follow.
tmp = Transformer(head_num=8,hidden_size=512,layer_num=4)
# temp_input = tf.random.uniform((64, 26))
temp_target = tf.random.uniform((64, 26))

In [None]:
# x = tmp((temp_input,temp_target,None,None,None))

In [None]:
# x.shape

In [None]:
# Per-position loss (no reduction) so padding can be masked out below.
# from_logits=False because the model's output layer already applies softmax.
loss_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=False,reduction='none')
def custom_loss(real,pred):
    """Sparse categorical cross-entropy averaged over non-padding tokens.

    Args:
        real: integer target ids; id 0 is treated as padding.
        pred: predicted probabilities over the vocabulary.

    Returns:
        A scalar: the summed loss of the non-padding positions divided by
        the number of non-padding positions (0 if there are none).
    """
    per_token = loss_obj(real,pred)
    mask = tf.cast(tf.math.not_equal(real,0),dtype=per_token.dtype)
    per_token *= mask
    # Divide by the count of real tokens rather than by every position, so
    # the loss value does not shrink as batches contain more padding.
    return tf.math.divide_no_nan(tf.reduce_sum(per_token),tf.reduce_sum(mask))

In [None]:
# custom_loss(temp_target,x)

In [None]:
# Load the TED pt->en dataset as (pt, en) pairs (as_supervised=True) together
# with its metadata, then pick the train and validation splits.
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

In [None]:
# Build one subword tokenizer per language from the training split, each
# with a target vocabulary size of 2**13. Every call iterates the whole
# training set once.
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_examples), target_vocab_size=2**13)

tokenizer_pt = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=2**13)

In [None]:
## !! 大小写敏感，应在之前执行全小写转换
sample_string = 'the people is awesome.'

tokenized_string = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string

for ts in tokenized_string:
  print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))

Tokenized string is [3, 57, 13, 2799, 7877]
The original string: the people is awesome.
3 ----> the 
57 ----> people 
13 ----> is 
2799 ----> awesome
7877 ----> .


In [None]:
# Shuffle buffer size and batch size used when building the datasets.
BUFFER_SIZE = 10000
BATCH_SIZE = 32

In [None]:
def encode(lang1, lang2):
  """Tokenize a (pt, en) pair and wrap each side in start/end ids.

  The start id is the tokenizer's ``vocab_size`` and the end id is
  ``vocab_size + 1``, so neither collides with a subword id.

  Args:
    lang1: eager tensor holding the Portuguese sentence.
    lang2: eager tensor holding the English sentence.

  Returns:
    Two lists of ints: the wrapped Portuguese ids and the wrapped English ids.
  """
  pt_start = tokenizer_pt.vocab_size
  en_start = tokenizer_en.vocab_size

  pt_ids = [pt_start, *tokenizer_pt.encode(lang1.numpy()), pt_start + 1]
  en_ids = [en_start, *tokenizer_en.encode(lang2.numpy()), en_start + 1]

  return pt_ids, en_ids

In [None]:
# Longest sequence (in tokens) kept by filter_max_length.
MAX_LENGTH = 600
def filter_max_length(x, y, max_length=MAX_LENGTH):
  """Return a boolean tensor: True when both x and y have at most max_length elements."""
  x_fits = tf.size(x) <= max_length
  y_fits = tf.size(y) <= max_length
  return tf.logical_and(x_fits, y_fits)

In [None]:
def tf_encode(pt, en):
  """Dataset-map wrapper around ``encode``.

  ``encode`` calls ``.numpy()`` on its arguments, which needs eager tensors,
  so it is invoked through ``tf.py_function``. The outputs come back without
  a static shape, so each is marked as a 1-D tensor of unknown length.
  """
  encoded = tf.py_function(encode, [pt, en], [tf.int64, tf.int64])
  for ids in encoded:
    ids.set_shape([None])

  result_pt, result_en = encoded
  return result_pt, result_en

In [None]:
# Training pipeline: tokenize, shuffle, pad each batch to its longest
# sequence, and prefetch.
# Two optional steps are left disabled, as before:
#   .filter(filter_max_length)  - drop over-long examples
#   .cache()                    - keep the dataset in memory for faster reads
train_dataset = (
    train_examples.map(tf_encode)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)


# Validation pipeline: tokenize and pad-batch only (no shuffle, no prefetch).
val_dataset = val_examples.map(tf_encode).padded_batch(BATCH_SIZE)

In [None]:
class Mask():
    """Builds the padding and look-ahead masks fed to the Transformer.

    In every mask a value of 1 marks a position that attention must ignore.
    """

    def create_padding_mask(self,seq):
        """Return a float32 mask that is 1 where ``seq`` equals 0 (padding)."""
        is_padding = tf.math.equal(seq, 0)
        padding = tf.cast(is_padding, tf.float32)

        # Two extra axes so the mask can be added to the attention logits.
        return padding[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

    def create_look_ahead_mask(self,size):
        """Return a (size, size) mask that is 1 strictly above the diagonal."""
        # band_part(ones, -1, 0) keeps the whole lower triangle including the
        # diagonal; subtracting it from 1 leaves exactly the future positions.
        lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return 1 - lower_triangle  # (seq_len, seq_len)

    def create_masks(self,inp,tar):
        """Return ``(enc_padding_mask, combined_mask, dec_padding_mask)``.

        Both padding masks are built from ``inp``. ``combined_mask`` is the
        element-wise maximum of the padding mask of ``tar`` and the
        look-ahead mask for the length of ``tar``.
        """
        # Padding mask for the encoder's self-attention.
        enc_padding_mask = self.create_padding_mask(inp)

        # Used by the decoder's second attention block, which attends over
        # the encoder output, so it is also derived from the encoder input.
        dec_padding_mask = self.create_padding_mask(inp)

        # Used by the decoder's first attention block: hides both padding
        # and future tokens of the decoder input.
        target_len = tf.shape(tar)[1]
        combined_mask = tf.maximum(
            self.create_padding_mask(tar),
            self.create_look_ahead_mask(target_len),
        )
        return enc_padding_mask, combined_mask, dec_padding_mask

    def __call__(self,inp,tar):
        """Shorthand for ``create_masks(inp, tar)``."""
        return self.create_masks(inp,tar)

In [None]:
def data_generrator(dataset):
    """Yield ``(model_inputs, targets)`` pairs forever from a batched dataset.

    For each ``(pt, en)`` batch the decoder input is ``en`` without its last
    column and the target is ``en`` without its first column.
    ``model_inputs`` is the list ``[inp, tar_inp, enc_padding_mask,
    dec_padding_mask, combined_mask]`` that ``Transformer.call`` unpacks.
    The dataset is repeated, so this generator never terminates by itself.
    """
    build_masks = Mask()
    for inp,en in dataset.repeat():
        tar_inp,tar_rel = en[:,:-1],en[:,1:]
        enc_padding_mask,combined_mask,dec_padding_mask = build_masks(inp,tar_inp)
        model_inputs = [inp,tar_inp,enc_padding_mask,dec_padding_mask,combined_mask]
        yield (model_inputs,tar_rel)

In [None]:
# Endless batch generators over the training and validation datasets.
gen = data_generrator(train_dataset)
val_gen = data_generrator(val_dataset)

In [None]:
# Number of batches in train_dataset; passed as steps_per_epoch because the
# generator itself never ends.
per_epoch_step = len(train_dataset)

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up then inverse-square-root learning-rate schedule.

  lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

  Args:
    d_model: model width used to scale the rate.
    warmup_steps: step at which the two branches of the minimum meet.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    # Keep the plain value for get_config(); self.d_model stays a float32
    # tensor as before.
    self._d_model_value = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    # Optimizers may pass their integer iteration counter, and rsqrt only
    # accepts floating-point input, so convert first.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized."""
    return {"d_model": self._d_model_value, "warmup_steps": self.warmup_steps}
# Schedule instance for d_model=512 and an Adam optimizer driven by it.
learning_rate = CustomSchedule(512)

opt = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [None]:
# Spot-check the schedule value at step 10.
learning_rate(10.)

<tf.Tensor: shape=(), dtype=float32, numpy=1.7469279e-06>

In [None]:
# Build the 5-layer model and compile it with the padding-masked loss.
# NOTE(review): optimizer='adam' selects a default Adam instance, so the
# `opt` object built with CustomSchedule is not used here - confirm intended.
model = Transformer(head_num=8,hidden_size=512,layer_num=5)
model.compile(loss=custom_loss,optimizer='adam',metrics='acc')
# model.fit((gen),epochs=20,steps_per_epoch=per_epoch_step)

In [None]:
# Train for one epoch of per_epoch_step batches drawn from gen.
model.fit((gen),epochs=1,steps_per_epoch=per_epoch_step)



<keras.callbacks.History at 0x1c878d1ac70>

In [None]:
# Save the whole model to the ./transfromer/ directory.
model.save('./transfromer/')



INFO:tensorflow:Assets written to: ./transfromer/assets


INFO:tensorflow:Assets written to: ./transfromer/assets


In [None]:
# Print the per-layer parameter table.
model.summary()

Model: "transformer_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_4 (Encoder)         multiple                  3152384   
                                                                 
 encoder_5 (Encoder)         multiple                  3152384   
                                                                 
 encoder_6 (Encoder)         multiple                  3152384   
                                                                 
 encoder_7 (Encoder)         multiple                  3152384   
                                                                 
 encoder_8 (Encoder)         multiple                  3152384   
                                                                 
 decoder_4 (Decoder)         multiple                  4204032   
                                                                 
 decoder_5 (Decoder)         multiple                

In [None]:
# Restore weights from the file that the ModelCheckpoint callback defined in
# a later cell writes to ("./transformer_2.weight").
weight_path = "transformer_2.weight"
model.load_weights(weight_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1c6fb760cd0>

In [None]:
# Continue training for one epoch with the currently compiled settings; the
# commented line is the alternative compile that uses the scheduled `opt`.
# model.compile(loss=custom_loss,optimizer=opt,metrics='acc')
model.fit((gen),epochs=1,steps_per_epoch=per_epoch_step)



<keras.callbacks.History at 0x1c892c05580>

In [None]:
# Recompile with RMSprop and the built-in sparse categorical cross-entropy.
# NOTE(review): this replaces custom_loss, so padding positions (id 0) are no
# longer masked out of the loss - confirm this is intended.
model.compile("rmsprop",loss="sparse_categorical_crossentropy",metrics=["accuracy"])

In [None]:
# Train for 50 epochs of per_epoch_step batches each.
model.fit((gen),epochs=50,steps_per_epoch=per_epoch_step)

In [None]:
# Train for one more epoch.
model.fit((gen),epochs=1,steps_per_epoch=per_epoch_step)



<keras.callbacks.History at 0x1c87e20dc40>

In [None]:
# Checkpoint callback: saves weights only, and only when the monitored
# training loss improves on the best value seen so far.
filepath = "./transformer_2.weight"
cp = keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto')

In [None]:
# Train for 50 epochs, checkpointing the best weights through cp.
model.fit((gen),epochs=50,steps_per_epoch=per_epoch_step,callbacks=[cp])

Epoch 1/50
Epoch 00001: loss did not improve from 1.10299
Epoch 2/50
Epoch 00002: loss improved from 1.10299 to 1.09892, saving model to .\transformer_2.weight
Epoch 3/50
Epoch 00003: loss did not improve from 1.09892
Epoch 4/50
Epoch 00004: loss did not improve from 1.09892
Epoch 5/50
Epoch 00005: loss did not improve from 1.09892
Epoch 6/50
Epoch 00006: loss did not improve from 1.09892
Epoch 7/50
Epoch 00007: loss did not improve from 1.09892
Epoch 8/50
Epoch 00008: loss improved from 1.09892 to 1.09584, saving model to .\transformer_2.weight
Epoch 9/50
Epoch 00009: loss improved from 1.09584 to 1.08255, saving model to .\transformer_2.weight
Epoch 10/50
Epoch 00010: loss did not improve from 1.08255
Epoch 11/50
Epoch 00011: loss did not improve from 1.08255
Epoch 12/50
Epoch 00012: loss did not improve from 1.08255
Epoch 13/50
Epoch 00013: loss did not improve from 1.08255
Epoch 14/50

In [None]:
# Prints a fixed marker value; presumably a quick check that the kernel is
# still responding - TODO confirm or remove.
print(1111)


1111


In [None]:
# Another 50-epoch run with best-weights checkpointing.
model.fit((gen),epochs=50,steps_per_epoch=per_epoch_step,callbacks=[cp])

In [None]:
# Long run: 500 epochs with best-weights checkpointing.
model.fit((gen),epochs=500,steps_per_epoch=per_epoch_step,callbacks=[cp])