## Understand GPT-2 model

- BERT, 分类模型；
- GPT-2, 生成模型；[Hugging Face Transformers/GPT2 Documents](https://huggingface.co/docs/transformers/en/model_doc/gpt2)

In [12]:
# load model to local

from transformers import AutoModelForCausalLM,AutoTokenizer

# model_name = "uer/gpt2-chinese-lyric"
model_name = "uer/gpt2-chinese-poem"
cache_dir = "../local_models"
AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

BertTokenizerFast(name_or_path='uer/gpt2-chinese-poem', vocab_size=22557, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [13]:
# Try the GPT2 model

from transformers import GPT2LMHeadModel, BertTokenizer, TextGenerationPipeline

# model_path="../local_models/models--uer--gpt2-chinese-cluecorpussmall/snapshots/c2c0249d8a2731f269414cc3b22dff021f8e07a3"
# model_path="../local_models/models--uer--gpt2-chinese-lyric/snapshots/4a42fd76daab07d9d7ff95c816160cfb7c21684f"
model_path="../local_models/models--uer--gpt2-chinese-poem/snapshots/6335c88ef6a3362dcdf2e988577b7bafeda6052b"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer,device="cpu")

prompt = "中文GPT2大规模预训练模型"
output = text_generator(prompt, max_length=100, do_sample=True)

print(model)
print(output)

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(22557, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=22557, bias=False)
)
[{'generated_text': '中文GPT2大规模预训练模型 ， 大 法 非 难 解 。

## Train a GPT-2 base model to be a Poem model:

### Step 1: Load the dataset

In [18]:
from torch.utils.data import Dataset
import torch

class PoemDataset(Dataset):
    def __init__(self, file_path):
        # Here, we are just reading the file. You can add custom pre-processing here
        with open(file_path, encoding="utf-8") as f:
            text = f.readlines()
        text = [i.strip() for i in text]
        self.text = text

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        return self.text[item]

dataset = PoemDataset("../local_datasets/Poem/chinese_poems.txt")

for data in dataset[:5]:
    print(data)

欲出未出光辣达,千山万山如火发.须臾走向天上来,逐却残星赶却月.
满目江山四望幽,白云高卷嶂烟收.日回禽影穿疏木,风递猿声入小楼.远岫似屏横碧落,断帆如叶截中流.
片片飞来静又闲,楼头江上复山前.飘零尽日不归去,帖破清光万里天.
因登巨石知来处,勃勃元生绿藓痕.静即等闲藏草木,动时顷刻徧乾坤.横天未必朋元恶,捧日还曾瑞至尊.不独朝朝在巫峡,楚王何事谩劳魂.
一气东南王斗牛,祖龙潜为子孙忧.金陵地脉何曾断,不觉真人已姓刘.


### Step 2: Training the model

- BERT, Incremental training model;
- GPT-2, Full training model;

In [23]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
from transformers.optimization import get_scheduler
import torch

model_path="../local_models/models--uer--gpt2-chinese-cluecorpussmall/snapshots/c2c0249d8a2731f269414cc3b22dff021f8e07a3"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

dataset_train = PoemDataset(file_path="../local_datasets/Poem/chinese_poems.txt")
def collate_fn(data):
    data = tokenizer.batch_encode_plus(
        data,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    data["labels"] = data["input_ids"].clone()
    return data

dataloader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=4,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn
)
print(f"Dataset length: {len(dataset_train)}")

def train():
    global model
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(DEVICE)

    optimizer = AdamW(model.parameters(), lr=2e-5) # lr 2e-5 - 5e-5
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=len(dataloader)
    )

    model.train()

    for epoch in range(3):
        for i, data in enumerate(dataloader):
            for k in data.keys():
                data[k] = data[k].to(DEVICE)
            outputs = model(**data)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Reset gradients
            optimizer.zero_grad()
            model.zero_grad()

            if i % 100 == 0:
                labels = data["labels"][:, 1:].contiguous() # Target
                out = outputs["logits"].argmax(dim=2)[:, :-1].contiguous() # Predictions
                select = labels != 0 # Select all tokens that are not <PAD>
                labels = labels[select]
                out = out[select]
                del select
                accuracy = (labels == out).sum().item() / labels.numel()
                lr = optimizer.state_dict()["param_groups"][0]["lr"]

                print(f"Epoch: {epoch}, Iteration: {i}, Loss: {loss.item()}, lr: {lr}, Accuracy: {accuracy}")

        torch.save(model.state_dict(), "params/model.pt")
        print("Model saved!")

train()

Dataset length: 304752
Epoch: 0, Iteration: 0, Loss: 9.866423606872559, lr: 1.9999737491468474e-05, Accuracy: 0.06976744186046512
Epoch: 0, Iteration: 100, Loss: 3.88360595703125, lr: 1.9973486638315748e-05, Accuracy: 0.18627450980392157
Epoch: 0, Iteration: 200, Loss: 3.2421789169311523, lr: 1.9947235785163018e-05, Accuracy: 0.19230769230769232
Epoch: 0, Iteration: 300, Loss: 4.859396457672119, lr: 1.9920984932010292e-05, Accuracy: 0.18292682926829268
Epoch: 0, Iteration: 400, Loss: 4.392740726470947, lr: 1.9894734078857565e-05, Accuracy: 0.2012987012987013
Epoch: 0, Iteration: 500, Loss: 5.057992935180664, lr: 1.986848322570484e-05, Accuracy: 0.19672131147540983


KeyboardInterrupt: 