## Understand GPT-2 model

- BERT, 分类模型；
- GPT-2, 生成模型；[Hugging Face Transformers/GPT2 Documents](https://huggingface.co/docs/transformers/en/model_doc/gpt2)

In [None]:
from torch.distributed.pipelining import pipeline
# Load model to local
from transformers import AutoModelForCausalLM,AutoTokenizer

# Hugging Face model id to download; the commented-out line is an alternative (lyric) checkpoint.
# model_name = "uer/gpt2-chinese-lyric"
model_name = "uer/gpt2-chinese-poem"
# Directory handed to both from_pretrained calls as cache_dir.
cache_dir = "../local_models"
# Load the model and its tokenizer with that cache_dir; the returned objects are not kept.
AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

In [None]:
# Try the GPT2 model
from transformers import GPT2LMHeadModel, BertTokenizer, TextGenerationPipeline

# Local snapshot directory of the checkpoint to try; the commented-out paths
# point at the other checkpoints (cluecorpussmall, lyric).
# model_path="../local_models/models--uer--gpt2-chinese-cluecorpussmall/snapshots/c2c0249d8a2731f269414cc3b22dff021f8e07a3"
# model_path="../local_models/models--uer--gpt2-chinese-lyric/snapshots/4a42fd76daab07d9d7ff95c816160cfb7c21684f"
model_path="../local_models/models--uer--gpt2-chinese-poem/snapshots/6335c88ef6a3362dcdf2e988577b7bafeda6052b"
# The GPT-2 LM-head model is paired with a BertTokenizer loaded from the same snapshot.
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
# Text-generation pipeline pinned to the CPU.
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer,device="cpu")

prompt = "中文GPT2大规模预训练模型"
# Sample a continuation of the prompt (do_sample=True) with max_length=100.
output = text_generator(prompt, max_length=100, do_sample=True)

# Show the model object and the generated result.
print(model)
print(output)

## Train a GPT-2 base model to be a Poem model:

### Step 1: Load the dataset

In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import torch

class PoemDataset(Dataset):
    """Text dataset that exposes one whitespace-stripped line of a file per sample."""

    def __init__(self, file_path):
        """Read the UTF-8 file at ``file_path`` into memory.

        Every line is kept (blank lines become empty strings), with leading
        and trailing whitespace removed. Custom pre-processing can be added
        here.
        """
        with open(file_path, encoding="utf-8") as handle:
            self.text = list(map(str.strip, handle))

    def __len__(self):
        """Return the number of lines that were read."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the line(s) selected by ``item`` (an index or a slice)."""
        return self.text[item]

# Load the poem corpus and print its first five lines as a quick sanity check.
dataset_train = PoemDataset(file_path="../local_datasets/Poem/chinese_poems.txt")
for data in dataset_train[:5]:
    print(data)

# Tokenizer loaded from the local gpt2-chinese-cluecorpussmall snapshot; collate_fn uses it.
model_path="../local_models/models--uer--gpt2-chinese-cluecorpussmall/snapshots/c2c0249d8a2731f269414cc3b22dff021f8e07a3"
tokenizer = AutoTokenizer.from_pretrained(model_path)
def collate_fn(data):
    """Turn a batch of raw text lines into tensors for the language model.

    Args:
        data: list of strings collected by the DataLoader.

    Returns:
        The tokenizer's batch encoding as PyTorch tensors (padded, truncated
        to at most 512 tokens) with an additional ``labels`` entry that is a
        copy of ``input_ids``.
    """
    encode_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 512,
    }
    batch = tokenizer.batch_encode_plus(data, **encode_options)
    # Labels are a separate copy of the inputs, not an alias of the same tensor.
    batch["labels"] = batch["input_ids"].clone()
    return batch

# Batches of 4 shuffled lines, tokenized by collate_fn; drop_last=True discards
# a final batch that has fewer than 4 samples.
dataloader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=4,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn
)
# Number of lines (samples) in the corpus.
print(f"Dataset length: {len(dataset_train)}")

### Step 2: Training the model

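- BERT: incremental training — typically the pretrained backbone is kept as-is and only a small task-specific head is trained on top of it;
- GPT-2: full training — here all model weights are updated during fine-tuning;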

In [None]:
from transformers import AutoModelForCausalLM, AdamW
from transformers.optimization import get_scheduler
import torch

# Base checkpoint to fine-tune: the local gpt2-chinese-cluecorpussmall snapshot.
# run_train() reads this module-level `model`.
model_path="../local_models/models--uer--gpt2-chinese-cluecorpussmall/snapshots/c2c0249d8a2731f269414cc3b22dff021f8e07a3"
model = AutoModelForCausalLM.from_pretrained(model_path)

def run_train():
    """Fine-tune the module-level ``model`` on ``dataloader``.

    Uses AdamW (lr 2e-5) with a linear learning-rate decay spread over the
    whole run, gradient-norm clipping at 1.0, and a loss that ignores padded
    positions. Every 100th iteration prints the loss, the current learning
    rate and the next-token accuracy over non-padded positions. The model's
    state dict is written to ``params/model.pt`` after every epoch.

    Reads the module-level names ``model`` and ``dataloader``; returns nothing.
    """
    import os  # function-scope import: only needed to create the checkpoint directory

    device = "cuda" if torch.cuda.is_available() else "cpu"
    epochs = 30000
    checkpoint_path = "params/model.pt"
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)  # usual range: 2e-5 - 5e-5
    # scheduler.step() runs once per batch in every epoch, so the decay horizon
    # must cover the whole run. With only len(dataloader) steps the learning
    # rate reaches 0 after the first epoch and stays there.
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=epochs * len(dataloader),
    )

    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    model.train()
    for epoch in range(epochs):
        for i, data in enumerate(dataloader):
            batch = {key: value.to(device) for key, value in data.items()}
            # Exclude padded positions from the loss: -100 is the label value
            # the Transformers causal-LM loss ignores.
            batch["labels"] = batch["labels"].masked_fill(
                batch["attention_mask"] == 0, -100
            )

            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # The optimizer holds every model parameter, so this clears all gradients.
            optimizer.zero_grad()

            if i % 100 == 0:
                # Position t predicts token t+1: drop the first label and the last prediction.
                targets = batch["labels"][:, 1:]
                predicted = outputs["logits"].argmax(dim=2)[:, :-1]
                valid = targets != -100  # real (non-padded) tokens only
                n_valid = valid.sum().item()
                if n_valid:
                    accuracy = (predicted[valid] == targets[valid]).sum().item() / n_valid
                else:
                    accuracy = 0.0
                lr = optimizer.param_groups[0]["lr"]

                print(f"Epoch: {epoch}, Iteration: {i}, Loss: {loss.item()}, lr: {lr}, Accuracy: {accuracy}")

        torch.save(model.state_dict(), checkpoint_path)
        print("Model saved!")

# Trigger the training
# run_train()

### Step 3: Generate the Poem with the fine-tuned model

- After training, the model weights are saved to disk.
- Now we can load those weights and generate poems.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextGenerationPipeline
import torch

# Rebuild the base architecture and tokenizer from the local snapshot, then
# overwrite the weights with the fine-tuned parameters.
model_path="../local_models/models--uer--gpt2-chinese-cluecorpussmall/snapshots/c2c0249d8a2731f269414cc3b22dff021f8e07a3"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

trained_parameter_path = "../local_params/net-2.pt"
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust
# (or pass weights_only=True where the installed torch version supports it).
loaded_object = torch.load(trained_parameter_path, map_location=torch.device("cpu"))
model.load_state_dict(loaded_object)

# Default pipeline to generate.
# Use the first GPU when one is available and fall back to the CPU otherwise;
# a hard-coded GPU index fails on CPU-only machines.
pipeline_device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipeline = TextGenerationPipeline(model, tokenizer, device=pipeline_device)
print(pipeline("白", max_length=24))

# Custom pipeline to generate
# Generates fixed-form poems (e.g. five-character quatrains): `text` is the
# prompt, `row` is the number of lines and `col` the number of characters per line.
def generate(text, row, col, num_samples=3):
    """Sample poems from the module-level ``model`` and print them.

    Each poem continues ``text`` and is laid out as ``row`` lines of ``col``
    characters; a "," is forced after every odd line and a "." after every
    even line. All other tokens are drawn by top-50 sampling with
    punctuation, brackets and special tokens excluded.

    Args:
        text: prompt that every sample starts with.
        row: number of lines to generate.
        col: number of characters per line (punctuation not counted).
        num_samples: how many poems to generate in one batch (default 3).

    Returns:
        None. Each sample is printed as ``index decoded_text``.

    Raises:
        KeyError: if "," or "." is missing from the tokenizer vocabulary.
    """
    # get_vocab() builds a dict, so look everything up once instead of per step.
    vocab = tokenizer.get_vocab()
    # Tokens that must never be sampled: punctuation/brackets (the forced "," and
    # "." are written directly, not sampled) and the special tokens.
    # Entries absent from the vocabulary are skipped.
    banned_tokens = list(",.()《》[]「」{}") + ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
    banned_ids = [vocab[token] for token in banned_tokens if token in vocab]
    comma_id = vocab[","]
    period_id = vocab["."]

    line_len = col + 1                # characters of one line plus its punctuation mark
    target_len = row * col + row + 1  # all lines plus the leading start token

    # Encode the prompt once per sample and drop the last token (the end marker)
    # so that the model continues the prompt.
    encoded = tokenizer.batch_encode_plus([text] * num_samples, return_tensors="pt")
    # Keep the inputs on the model's device; the model may have been moved to a GPU.
    input_ids = encoded["input_ids"][:, :-1].to(model.device)

    # Plain loop instead of recursion: depth does not grow with the poem length.
    while input_ids.shape[1] < target_len:
        current_len = input_ids.shape[1]

        if current_len % line_len == 0:
            # End of a line: force the punctuation mark ("." after even lines,
            # "," after odd lines). No model call is needed for a forced token.
            punct_id = period_id if (current_len // line_len) % 2 == 0 else comma_id
            next_ids = torch.full_like(input_ids[:, :1], punct_id)
        else:
            with torch.no_grad():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=torch.ones_like(input_ids),
                    # All zeros on every step, matching the initial encoding.
                    # NOTE(review): presumably this also matches what the
                    # tokenizer emitted during training - confirm.
                    token_type_ids=torch.zeros_like(input_ids),
                )
            # Unnormalised scores for the next token of every sample.
            logits = outputs["logits"][:, -1]

            # Top-k filtering: everything below the 50th-largest score is removed.
            kth_value = torch.topk(logits, 50).values[:, -1].unsqueeze(dim=1)
            logits = logits.masked_fill(logits < kth_value, -float("inf"))
            logits[:, banned_ids] = -float("inf")

            # Draw one token id per sample from the remaining distribution.
            next_ids = logits.softmax(dim=1).multinomial(num_samples=1)

        input_ids = torch.cat([input_ids, next_ids], dim=1)

    # Print each generated sample with its index.
    for i in range(num_samples):
        print(i, tokenizer.decode(input_ids[i]))

# Generate poems of 4 lines with 5 characters per line, starting from the prompt "白".
generate("白",row=4,col=5)