In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


# 定义位置编码
def get_positional_encoding(max_length, d_model):
    positions = np.arange(max_length)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    pe = np.zeros((max_length, d_model))
    pe[:, 0::2] = np.sin(positions * div_term)
    pe[:, 1::2] = np.cos(positions * div_term)
    return tf.constant(pe, dtype=tf.float32)


# 定义多头自注意力层
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.wq = layers.Dense(d_model)  # Query
        self.wk = layers.Dense(d_model)  # Key
        self.wv = layers.Dense(d_model)  # Value

        self.dense = layers.Dense(d_model)  # Output layer

    def split_heads(self, x):
        # 将最后一维分成多个头部
        batch_size = tf.shape(x)[0]
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])  # (batch_size, num_heads, seq_length, depth)

    def call(self, inputs):
        q = self.split_heads(self.wq(inputs))  # Query
        k = self.split_heads(self.wk(inputs))  # Key
        v = self.split_heads(self.wv(inputs))  # Value

        # 计算注意力分数
        scaled_attention_logits = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.depth, tf.float32))
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

        # 应用注意力权重
        output = tf.matmul(attention_weights, v)
        output = tf.transpose(output, perm=[0, 2, 1, 3])  # (batch_size, seq_length, num_heads, depth)

        # 连接所有头
        output = tf.reshape(output, (tf.shape(output)[0], -1, self.d_model))  # (batch_size, seq_length, d_model)
        return self.dense(output)


# 定义GPT模型
class GPT(keras.Model):
    def __init__(self, vocab_size, max_length, d_model, num_heads, num_layers, dropout_rate):
        super(GPT, self).__init__()
        self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=d_model)
        self.positional_encoding = get_positional_encoding(max_length, d_model)
        self.attention_layers = [MultiHeadSelfAttention(d_model, num_heads) for _ in range(num_layers)]
        self.dense = layers.Dense(vocab_size)

    def call(self, inputs, training):
        x = self.embedding(inputs) + self.positional_encoding[:tf.shape(inputs)[1], :]
        for attention_layer in self.attention_layers:
            x = attention_layer(x)
        return self.dense(x)


# 设置超参数
vocab_size = 5000  # 词汇表大小
max_length = 50  # 最大序列长度
d_model = 128  # 嵌入维度
num_heads = 4  # 注意力头数
num_layers = 4  # Transformer层数
dropout_rate = 0.1  # dropout比例

# 构建模型
gpt_model = GPT(vocab_size, max_length, d_model, num_heads, num_layers, dropout_rate)

# 编译模型
gpt_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy')

# 示例数据
# 假设有一个训练数据集，输入是形状为(batch_size, max_length)的整数序列
# 这里生成一些随机数据作为示例
x_train = np.random.randint(0, vocab_size, (1000, max_length))  # 1000个样本
y_train = np.random.randint(0, vocab_size, (1000, max_length))  # 目标数据

# 训练模型
gpt_model.fit(x_train, y_train, epochs=5, batch_size=32)


# 示例：生成文本
def generate_text(model, start_token, max_length, num_tokens):
    generated = [start_token]
    for _ in range(num_tokens):
        input_sequence = np.array(generated)[np.newaxis, :]
        predictions = model.predict(input_sequence)
        next_token = np.argmax(predictions[0, -1, :])  # 获取最后一个token的预测
        generated.append(next_token)
    return generated


# 生成文本示例
start_token = 1  # 假设1是开始token
generated_sequence = generate_text(gpt_model, start_token, max_length, num_tokens=10)
print("Generated Sequence:", generated_sequence)


Epoch 1/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 105ms/step - loss: 15.9006
Epoch 2/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 110ms/step - loss: 12.7330
Epoch 3/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 111ms/step - loss: 14.2599
Epoch 4/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 112ms/step - loss: 14.3239
Epoch 5/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 113ms/step - loss: 14.3817
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms