In [11]:
# 导入必要包
import torch
from torch.utils.data import DataLoader, Dataset
import json
import datasets

In [33]:
# 准备数据
# datapath = '../../data/alpaca_gpt4_data_dev.json'  # dev数据只有少量数据，用于开发，实际训练时请使用full数据集
datapath = '../../data/alpaca_gpt4_data.json'  # full数据集

# 定义tokenizer
from transformers import GPT2Tokenizer
model_path = "F:\llm-deploy\docs\chapter2\models\GPT-2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# 定义数据集类
def get_flat_data(datapath):
    data = json.load(open(datapath, 'r'))
    inputs = []  # 指令和输入
    labels = []  # 输出

    for item in data:
        inputs.append(f"{item['instruction']} {item['input']}")
        labels.append(item['output']) 
    return {
        'input': inputs,
        'label': labels
    }

def tokenize(example):
    tokenizer.pad_token = tokenizer.eos_token
    input_token = tokenizer(
        example['input'],
        max_length=512,
        truncation=True
    )
    iids = input_token['input_ids']
    label_token = tokenizer(
        example['label'],
        max_length=512,
        truncation=True
    )
    lids = label_token['input_ids']

    lengths = []
    input_ids = []
    label_ids = []
    attention_mask = []
    for iid, lid in zip(iids, lids):
        lengths.append(len(iid) + len(lid))
        input_ids.append(iid + lid)
        label_ids.append([-100]*len(iid) + lid)
        attention_mask.append([1]*(len(iid) + len(lid)))
    
    lengths = torch.tensor(lengths)
    pad_length = (lengths.max() - lengths).tolist()
    for i, l in enumerate(pad_length):
        input_ids[i] = [tokenizer.pad_token_id]*l + input_ids[i]
        attention_mask[i] = [0]*l + attention_mask[i]
        label_ids[i] = [-100]*l + label_ids[i]
    return {
        "input_ids": torch.tensor(input_ids, dtype=torch.long),
        "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
        "labels": torch.tensor(label_ids, dtype=torch.long)
    }

# 创建数据集
flatdata = get_flat_data(datapath)
dataset = datasets.Dataset.from_dict(flatdata)
dataset = dataset.map(tokenize, batched=True, batch_size=len(dataset))  # 使用map方法对数据集进行批处理
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
    

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [26]:
# train

# 定义模型
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained(model_path)

def train_step(batch):
    kwargs = {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
        "labels": batch["labels"],
    }
    res = model(**kwargs)["loss"]
    return res

def val():
    model.eval()
    model.to('cpu')
    for step, batch in enumerate(dataloader):
        loss = train_step(batch)
        if step % 10 == 0:
            print(f"val step: {step}, loss: {loss.item()}")

def train():
    model.train()
    model.to('cpu')
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    for epoch in range(3):
        for step, batch in enumerate(dataloader):
            loss = train_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if step % 10 == 0:
                print(f"epoch: {epoch}, step: {step}, loss: {loss.item()}")
        torch.save(model.state_dict(), f"model_{epoch}.pt")
        val()

train()

epoch: 0, step: 0, loss: 8.52311897277832
val step: 0, loss: 5.500304698944092
epoch: 1, step: 0, loss: 5.841464042663574
val step: 0, loss: 4.260898113250732
epoch: 2, step: 0, loss: 4.249709129333496
val step: 0, loss: 3.278724193572998


In [32]:
# 测试效果
def test(trained=True):
    gpt2 = GPT2LMHeadModel.from_pretrained(model_path)
    if trained:
        gpt2.load_state_dict(torch.load('model_2.pt'))
    gpt2.eval()
    gpt2.to('cpu')
    
    text = "Please let me know your thoughts on the given place and why you think it deserves to be visited. \"Barcelona, Spain\""
    encoded_input = tokenizer(text, return_tensors='pt')
    output = gpt2(**encoded_input)
    logits = output.logits
    predicted_index = torch.argmax(logits, dim=-1)
    predicted_text = tokenizer.decode(predicted_index[0])
    print(predicted_text)

test(trained=True)
test(trained=False)

. me know if thoughts on this matter.. time you think it's to be here.

celona is Spain"

. me know if thoughts on this matter.. time you think it's to be here.

celona" Spain"

