In [1]:
from gpt1 import *

2023-06-12 22:47:47.819801: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:

# Upper bound used twice in this notebook: titles with more characters than
# this are skipped by load_data, and encoded id sequences are padded to it.
MAX_LENGTH = 40

# Category code (as stored in the data file) -> category display name.
cat_name_dic = {
    '100': '民生',
    '101': '文化',
    '102': '娱乐',
    '103': '体育',
    '104': '财经',
    '106': '房产',
    '107': '汽车',
    '108': '教育',
    '109': '科技',
    '110': '军事',
    '112': '旅游',
    '113': '国际',
    '114': '证券',
    '115': '农业',
    '116': '电竞'
}

# Category names in dict insertion order; a name's position is its class index.
cat_name_all = [name for name in cat_name_dic.values()]
# Reverse lookup: category name -> class index.
cat_name_label = {name: position for position, name in enumerate(cat_name_all)}

def load_data(file_path):
    """Read a headline file and return ``[title, one_hot_label]`` pairs.

    Each non-blank line must hold five ``_!_``-separated fields:
    id, category code, category tag, title, keywords. Only the category code
    and the title are used.

    Lines are skipped when the category code is not in ``cat_name_dic`` or
    when the title has more than ``MAX_LENGTH`` characters. Blank lines are
    skipped as well.

    Args:
        file_path: path of the UTF-8 text file to read.

    Returns:
        A list of ``[title, label]`` where ``label`` is a list of 0/1 ints
        with a single 1 at the index given by ``cat_name_label``.

    Raises:
        ValueError: if a non-blank line does not have exactly five fields.
    """
    n_classes = len(cat_name_all)
    corpus = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # Drop the line terminator (the previous strip("") removed nothing).
            line = line.rstrip("\r\n")
            if not line.strip():
                continue

            fields = line.split("_!_")
            if len(fields) != 5:
                raise ValueError(
                    '{}: line {} has {} fields, expected 5'.format(
                        file_path, line_no, len(fields)))
            _, cat, _, title, _ = fields

            cat_name = cat_name_dic.get(cat, '')
            if cat_name == '' or len(title) > MAX_LENGTH:
                continue

            label = [0] * n_classes
            label[cat_name_label[cat_name]] = 1
            corpus.append([title, label])
    return corpus

# Read the raw data file into a list of [title, one-hot label] pairs.
corpus = load_data('./toutiao_cat_data.txt')

In [3]:
# Shuffle the examples in place.
# NOTE(review): no random seed is set in the visible cells, so the order
# (and everything derived from it) differs between runs - confirm intended.
random.shuffle(corpus)

In [4]:
### Word segmentation
# Rewrite every title as its jieba tokens (cut_all=False) joined by single
# spaces; the one-hot label is carried along unchanged.
corpus_format = [
    [" ".join(jieba.cut(raw_title, cut_all=False)), one_hot]
    for raw_title, one_hot in corpus
]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.661 seconds.
Prefix dict has been built successfully.


In [5]:
import os

# Number of (shuffled) examples used for training; the remainder is held out
# for validation.
TRAIN_SIZE = 300000
# File prefix for the persisted subword vocabulary ('.subwords' is appended).
TOKENIZER_PREFIX = './tokenizer_title'

random.shuffle(corpus_format)
train_examples, val_examples = corpus_format[:TRAIN_SIZE], corpus_format[TRAIN_SIZE:]

# The subword vocabulary is learned from the randomly shuffled training split,
# so rebuilding it gives a different vocabulary size / id assignment on every
# run. Model weights saved with one vocabulary cannot be restored with another:
# the restore error recorded later in this notebook (8161 vs 8275 embedding
# rows) is a vocabulary-size mismatch.
# NOTE(review): presumably the checkpoint was trained with a different
# vocabulary file - if it exists, point TOKENIZER_PREFIX at it.
if os.path.exists(TOKENIZER_PREFIX + '.subwords'):
    tokenizer_title = tfds.deprecated.text.SubwordTextEncoder.load_from_file(
        TOKENIZER_PREFIX)
else:
    tokenizer_title = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        (k[0] for k in train_examples), target_vocab_size=2**13)
    # Persist so that later runs (and checkpoints written by them) share it.
    tokenizer_title.save_to_file(TOKENIZER_PREFIX)

In [6]:
# Number of entries in the learned subword vocabulary.
tokenizer_title.vocab_size

8273

In [7]:
# Sanity check: encode a pre-segmented (space-separated) headline into subword ids.
sample_str = '为什么 商用 客机 一般 先要 卖 给 银行'
tokenized_str = tokenizer_title.encode(sample_str)
print(tokenized_str)

[21, 442, 68, 923, 313, 1931, 843, 43, 419, 98, 3341]


In [8]:
# Decode the ids again to confirm that the encoding round-trips to the input text.
original_str = tokenizer_title.decode(tokenized_str)
print(original_str)

为什么 商用 客机 一般 先要 卖 给 银行


In [9]:
# Total number of segmented examples (training + validation splits together).
len(corpus_format)

379641

In [10]:
def encode(lang):
    """Convert a ``[title, label]`` pair into ``[token_ids, label]``.

    The title is subword-encoded with ``tokenizer_title`` and wrapped in two
    ids that lie just outside the tokenizer's vocabulary: ``vocab_size`` in
    front and ``vocab_size + 1`` at the end. The label is returned untouched.
    """
    text, label = lang
    first_id = tokenizer_title.vocab_size
    last_id = tokenizer_title.vocab_size + 1
    token_ids = [first_id]
    token_ids += tokenizer_title.encode(text)
    token_ids.append(last_id)
    return [token_ids, label]

def filter_long_sent(x, y, max_length=MAX_LENGTH):
    """Return a boolean tensor that is True when neither ``x`` nor ``y`` has
    more than ``max_length`` elements (as measured by ``tf.size``)."""
    x_fits = tf.size(x) <= max_length
    y_fits = tf.size(y) <= max_length
    return tf.logical_and(x_fits, y_fits)

def pad_with_zero(lang, max_length=MAX_LENGTH):
    """Right-pad the id sequence of a ``[token_ids, label]`` pair with zeros.

    Args:
        lang: ``[token_ids, label]``; ``token_ids`` is a Python list.
        max_length: target length. Sequences that are already this long or
            longer are returned unpadded (they are never truncated).

    Returns:
        A new ``[padded_token_ids, label]`` list; the input list is not mutated.
    """
    token_ids, label = lang
    # Honour the caller-supplied max_length (it used to be ignored in favour
    # of the global MAX_LENGTH).
    n_pad = max(max_length - len(token_ids), 0)
    return [token_ids + [0] * n_pad, label]

In [11]:
# Encode each training pair, keep only those whose id sequence fits in
# MAX_LENGTH, and zero-pad the survivors.
train_examples = [
    pad_with_zero(pair)
    for pair in map(encode, train_examples)
    if len(pair[0]) <= MAX_LENGTH
]

# Switch to a column layout: one list of id sequences, one list of labels.
dic = {
    'title': [ids for ids, _ in train_examples],
    'cat': [one_hot for _, one_hot in train_examples],
}
train_examples = dic

In [12]:
# Build a dataset whose elements are dicts with a 'title' entry (padded token
# ids) and a 'cat' entry (one-hot label), one element per training example.
train_dataset = tf.data.Dataset.from_tensor_slices(train_examples)

2023-06-12 22:50:53.234442: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2023-06-12 22:50:53.259610: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:43:00.0 name: NVIDIA GeForce RTX 3080 computeCapability: 8.6
coreClock: 1.71GHz coreCount: 68 deviceMemorySize: 9.77GiB deviceMemoryBandwidth: 707.88GiB/s
2023-06-12 22:50:53.259638: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2023-06-12 22:50:53.262756: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2023-06-12 22:50:53.262813: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2023-06-12 22:50:53.263758: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuff

In [13]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

# Cache the dataset to speed up reading.
train_dataset = train_dataset.cache()

# Shuffle, then batch. The shuffle was announced by the original comment and
# BUFFER_SIZE was defined for it, but it was never applied, so every epoch
# used to see identical batches in identical order.
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Prefetch upcoming batches.
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Same preprocessing as the training split: encode, drop over-long sequences,
# zero-pad.
val_examples = [encode(k) for k in val_examples]
val_examples = [k for k in val_examples if len(k[0]) <= MAX_LENGTH]
val_examples = [pad_with_zero(k) for k in val_examples]

# Build a fresh dict. Writing into the dict left over from the training cell
# would overwrite train_examples (it is the very same object) and would make
# this cell depend on that cell having been run first.
dic = {
    'title': [k[0] for k in val_examples],
    'cat': [k[1] for k in val_examples],
}
val_examples = dic

In [15]:
# Unbatched validation dataset: each element is a dict with 'title' (padded
# token ids) and 'cat' (one-hot label).
val_dataset = tf.data.Dataset.from_tensor_slices(val_examples)

In [16]:
# Model hyperparameters handed to the GPT1 constructor further down.
num_layers = 4
d_model = 128
dff = 512
# Two ids beyond the tokenizer's vocabulary are used by encode() to mark the
# beginning and end of a title, hence the +2.
target_vocab_size = tokenizer_title.vocab_size + 2
max_seq_len = MAX_LENGTH
dropout_rate = 0.1
# One output class per category.
n_class = len(cat_name_dic)

In [17]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a warm-up phase followed by inverse-sqrt decay.

    The rate at a given step is::

        d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    Args:
        d_model: model width; cast to float32 and stored.
        warmup_steps: step at which the two terms of the ``min`` are equal.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for ``step`` as a float32 tensor."""
        # Depending on the TensorFlow/Keras version, the optimizer may pass an
        # integer step counter; rsqrt needs a floating-point tensor.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

# Define the optimizer: Adam driven by the warm-up schedule above.
learing_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learing_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [18]:
# Cross-entropy on one-hot labels, computed from raw logits. reduction='none'
# leaves the averaging to loss_fun_fine_tuning.
loss_object_fine_tuning = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction='none')
# Running mean of the fine-tuning loss and running classification accuracy;
# both are updated inside train_step.
train_loss_fine_tuning = tf.keras.metrics.Mean(name='train_loss_fine_tuning')
train_accuracy_fine_tuning = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy_fine_tuning')

In [23]:
# Number of attention heads (passed to GPT1 as n_heads).
num_heads = 8

In [25]:
# Model definition
class GPT1(tf.keras.Model):
    """Decoder stack followed by a dense projection onto the vocabulary.

    A second dense layer, ``fine_tuning_layer``, is created here but is not
    applied in ``call``; GPT1FT applies it on top of this model's output.

    Args:
        n_layers, d_model, n_heads, diff, max_seq_len, drop_rate: forwarded
            unchanged to ``Decoder`` (``diff`` is presumably the feed-forward
            width - confirm against Decoder).
        target_vocab_size: forwarded to ``Decoder`` and used as the output
            width of ``final_layer``.
        fine_tuning_class_num: output width of ``fine_tuning_layer``.
    """

    def __init__(self, n_layers, d_model, n_heads, diff,
                 target_vocab_size,
                 max_seq_len,
                 fine_tuning_class_num,
                 drop_rate=0.1):
        super().__init__()

        self.decoder = Decoder(
            n_layers, d_model, n_heads, diff,
            target_vocab_size, max_seq_len, drop_rate,
        )
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
        self.fine_tuning_layer = tf.keras.layers.Dense(fine_tuning_class_num)

    def call(self, targets, training, look_ahead_mask):
        """Return ``(vocabulary_logits, attention_weights)`` for ``targets``."""
        hidden, att_weights = self.decoder(targets, training, look_ahead_mask)
        vocab_logits = self.final_layer(hidden)
        return vocab_logits, att_weights
    

In [26]:
class GPT1FT(tf.keras.Model):
    """Classification wrapper around an existing GPT1 instance.

    The wrapped model's first output is flattened and passed through the
    wrapped model's own ``fine_tuning_layer``.
    """

    def __init__(self, gpt):
        super().__init__()
        self.gpt = gpt

    def call(self, targets, training, look_ahead_mask):
        """Return the output of ``fine_tuning_layer`` for ``targets``."""
        lm_out, _unused_attention = self.gpt(targets, training, look_ahead_mask)
        flattened = tf.keras.layers.Flatten()(lm_out)
        return self.gpt.fine_tuning_layer(flattened)

In [22]:
# Instantiate the language model with the hyperparameters chosen earlier.
num_heads = 8
gpt1 = GPT1(
    n_layers=num_layers,
    d_model=d_model,
    n_heads=num_heads,
    diff=dff,
    target_vocab_size=target_vocab_size,
    max_seq_len=max_seq_len,
    fine_tuning_class_num=n_class,
    drop_rate=dropout_rate,
)

In [27]:
# Wrap the language model so that a forward pass yields classification logits.
gpt1_ft = GPT1FT(gpt1)

In [28]:
# Track the model and the optimizer in one checkpoint object.
checkpoint_path = './checkpoint/train_cat'
ckpt = tf.train.Checkpoint(gpt1=gpt1, optimizer=optimizer)

# Checkpoint manager; keeps at most the three most recent checkpoints.
ckpt_manager = tf.train.CheckpointManager(
    ckpt,
    checkpoint_path,
    max_to_keep=3,
)

# Resume from the newest checkpoint when one exists.
# NOTE(review): the saved variables must have the shapes this model creates
# (in particular the same vocabulary size) - verify against the checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('last checkpoit restore')

last checkpoit restore


In [29]:
# Freeze the decoder stack and the vocabulary projection; of the three layers
# GPT1 creates, only fine_tuning_layer stays trainable.
gpt1_ft.gpt.decoder.trainable = False
gpt1_ft.gpt.final_layer.trainable = False

In [30]:
# Build the attention mask
def create_mask(targets):
    """Combine the look-ahead mask and the padding mask for ``targets``.

    The look-ahead mask is built from the size of axis 1 of ``targets``, the
    padding mask from ``targets`` itself; the result is their element-wise
    maximum.
    NOTE(review): presumably a value of 1 means "masked out" in both helper
    masks - confirm against create_look_ahead_mark / create_padding_mark.
    """
    seq_len = tf.shape(targets)[1]
    future_mask = create_look_ahead_mark(seq_len)
    pad_mask = create_padding_mark(targets)
    return tf.maximum(pad_mask, future_mask)

def loss_fun(y_ture, y_pred):
    """Loss from ``loss_object`` with positions whose target id is 0 zeroed out.

    The zeroed positions still count in the denominator of the mean: the
    result is ``reduce_mean`` over every position.
    """
    per_position = loss_object(y_ture, y_pred)
    # 1 where the target is a real token, 0 where the target id is 0.
    keep = tf.cast(tf.math.not_equal(y_ture, 0), dtype=per_position.dtype)
    return tf.reduce_mean(per_position * keep)

def loss_fun_fine_tuning(y_ture, y_pred):
    """Mean of the losses that ``loss_object_fine_tuning`` returns for the
    given labels and predictions."""
    return tf.reduce_mean(loss_object_fine_tuning(y_ture, y_pred))

def train_step(targets):
    """Run one fine-tuning update on a batch.

    Args:
        targets: batch dict with 'title' (token ids, indexed as
            [batch, position]) and 'cat' (labels compared against the
            classifier output).

    Side effects:
        Applies one optimizer step to the trainable variables of ``gpt1_ft``
        and updates ``train_loss_fine_tuning`` and
        ``train_accuracy_fine_tuning``.
    """
    # The model input is every sequence without its last position.
    tar_inp = targets['title'][:, :-1]
    cat_name = targets['cat']

    # Build the attention mask for the input.
    combined_mask = create_mask(tar_inp)

    with tf.GradientTape() as tape:
        predict_fine_tuning = gpt1_ft(tar_inp, True, combined_mask)
        loss_fine_tuning = loss_fun_fine_tuning(cat_name, predict_fine_tuning)

    # Collect the variable list once (after the forward pass) so that the
    # gradient computation and the update use exactly the same list.
    trainable_vars = gpt1_ft.trainable_variables
    gradients = tape.gradient(loss_fine_tuning, trainable_vars)
    optimizer.apply_gradients(zip(gradients, trainable_vars))

    # Record loss and accuracy.
    train_loss_fine_tuning(loss_fine_tuning)
    train_accuracy_fine_tuning(cat_name, predict_fine_tuning)

In [32]:
EPOCHS = 4
# Global step indices at which a loss sample was recorded.
step_list = []
# train_step only computes the fine-tuning loss, so both lists receive that
# value; loss_list is kept (aligned with step_list) for code that reads it.
loss_list = []
loss_list_fine_tuning = []
step = 0

for epoch in range(EPOCHS):
    start = time.time()

    # Reset the running metrics at the start of each epoch.
    train_loss_fine_tuning.reset_states()
    train_accuracy_fine_tuning.reset_states()

    for batch, all_inputs in enumerate(train_dataset):

        # One optimisation step.
        train_step(all_inputs)

        # Print the model summary once, after the first forward pass,
        # instead of after every batch.
        if step == 0:
            gpt1.summary()

        if batch % 1000 == 0:
            loss_fine_tuning = train_loss_fine_tuning.result()
            print('epoch {}, batch {}, loss_fine:{:.4f}, acc:{:.4f}'.format(
                epoch+1, batch, loss_fine_tuning, train_accuracy_fine_tuning.result()
            ))
            step_list.append(step)
            loss_list.append(loss_fine_tuning)
            loss_list_fine_tuning.append(loss_fine_tuning)
        step += 1

    if (epoch + 1) % 2 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('epoch {}, save model at {}'.format(epoch+1, ckpt_save_path))

    # Report the metrics that train_step actually updates (train_loss and
    # train_accuracy are never touched by it).
    print('epoch {}, loss:{:.4f}, acc:{:.4f}'.format(
        epoch+1, train_loss_fine_tuning.result(), train_accuracy_fine_tuning.result()))
    print('time in 1 epoch:{} secs\n'.format(time.time()-start))

2023-06-13 21:24:34.282008: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


ValueError: Tensor's shape (8161, 128) is not compatible with supplied shape (8275, 128)

In [None]:
from tqdm import tqdm

def evaluate(inp_sentence):
    """Predict the class index of one pre-segmented (space-separated) headline.

    The sentence is prepared the same way as the training inputs: wrapped in
    the two marker ids, zero-padded to ``MAX_LENGTH`` and fed to the model
    without its last position.

    Args:
        inp_sentence: headline text, tokens separated by spaces.

    Returns:
        An int32 tensor holding one predicted class index per batch row
        (a single row here).
    """
    start_token = [tokenizer_title.vocab_size]
    end_token = [tokenizer_title.vocab_size + 1]

    # Leave room for the two markers so the sequence never exceeds MAX_LENGTH;
    # the classification head only accepts the fixed training length.
    token_ids = tokenizer_title.encode(inp_sentence)[:MAX_LENGTH - 2]
    sequence = start_token + token_ids + end_token
    sequence = sequence + [0] * (MAX_LENGTH - len(sequence))
    sequence = sequence[:-1]
    model_input = tf.expand_dims(sequence, 0)

    combined_mask = create_mask(model_input)
    # gpt1 itself returns (vocabulary logits, attention weights); the
    # classification logits come from the fine-tuning wrapper.
    predict_fine_tuning = gpt1_ft(model_input, False, combined_mask)
    predicted_id = tf.cast(tf.argmax(predict_fine_tuning, axis=-1), tf.int32)
    return predicted_id

def evaluate_func(val_dataset, batch_size=64):
    """Classify every example of an unbatched validation dataset.

    Args:
        val_dataset: dataset of dicts with 'title' (padded token ids) and
            'cat' (one-hot label), one example per element.
        batch_size: number of examples per forward pass.

    Returns:
        ``(predict, real)``: two equally long lists of int class indices.
    """
    predict = []
    real = []
    for batch in tqdm(val_dataset.batch(batch_size)):
        # Same input preparation as in training: drop the last position.
        inp = batch['title'][:, :-1]
        combined_mask = create_mask(inp)
        # gpt1 itself returns (vocabulary logits, attention weights); the
        # classification logits come from the fine-tuning wrapper.
        predict_fine_tuning = gpt1_ft(inp, False, combined_mask)

        predict.extend(tf.argmax(predict_fine_tuning, axis=-1).numpy().tolist())
        # The position of the 1 in a one-hot label is its class index.
        real.extend(tf.argmax(batch['cat'], axis=-1).numpy().tolist())

    return predict, real

In [None]:
def get_cat_name(sentence, plot=''):
    """Print ``sentence`` (with every space removed from the printed line) and
    the category name predicted for it.

    ``plot`` is accepted but not used.
    """
    class_index = evaluate(sentence)[0]
    category = cat_name_all[class_index]

    # The spaces are stripped from the whole formatted line, label included.
    input_line = '输入: {}'.format(sentence).replace(" ", "")
    print(input_line)
    print('预测输出: {}'.format(category))

def get_real_cat(label):
    """Return the category name for a one-hot ``label`` list, using the
    position of its first 1 as the index into ``cat_name_all``."""
    return cat_name_all[label.index(1)]


In [None]:
# Accuracy on the validation split: the fraction of examples whose predicted
# class index equals the true class index.
predict, real = evaluate_func(val_dataset)
acc = np.sum(np.array(predict) == np.array(real)) / len(real)
print("验证集上的准确率：", acc)

In [None]:
# Classify a few hand-written headlines; a separator line is printed between
# consecutive results.
demo_headlines = (
    "《狂飙》结局后，张译终于发声了，剧中演员回应一辈子不想见张译",
    "教育部下发新通知，将调整今年的高考方向，家长看完心态“崩”了",
    "俄罗斯学会了，发射大批气球飞向乌克兰，乌军导弹快不够用了",
)
for position, headline in enumerate(demo_headlines):
    if position > 0:
        print("==============================================================")
    # Segment with jieba first: the tokenizer was built on space-separated words.
    s = " ".join(jieba.cut(headline))
    get_cat_name(s)

In [None]:
# One more example: show the jieba segmentation, then the predicted category.
s = "今年小麦的产量低于往年"
s = " ".join(jieba.cut(s))
print(s)
get_cat_name(s)