# 一、 加载Dataset数据集




In [16]:
import json
# Read the whole instruction dataset into memory, then preview the second
# record and the total number of records.
with open("instruction-data.json", "r", encoding="utf-8") as f:
    raw_text = f.read()
data = json.loads(raw_text)

print(data[1])
print(len(data))

{'instruction': 'Edit the following sentence for grammar.', 'input': 'He go to the park every day.', 'output': 'He goes to the park every day.'}
1100


# 二、 训练样本的 request 和 response 格式

```text

"Below is an instruction that describes a task."
"Write a response that appropriately completes the request."
"\n\n### Instruction:\n   instruction"

"\n\n### Input:\n input" 

 
"\n\n### Response:\noutput"
```


In [17]:
def format_input(item):
    """Build the prompt part of a training text for one dataset record.

    Args:
        item: Mapping with an ``"instruction"`` entry and an optional
            ``"input"`` entry.

    Returns:
        The preamble followed by the ``### Instruction:`` section and, only
        when the record has a non-empty input, an ``### Input:`` section.
        The ``### Response:`` section is not part of the returned text.

    Raises:
        KeyError: If ``item`` has no ``"instruction"`` entry.
    """
    # Adjacent string literals are concatenated as-is, so the first sentence
    # needs its own trailing space; without it the prompt reads "task.Write".
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{item['instruction']}"
    )

    # A record without an "input" key is treated like one with an empty input.
    extra_input = item.get("input")
    input_text = f"\n\n### Input:\n{extra_input}" if extra_input else ""

    return instruction_text + input_text

# Preview one complete training text: the prompt built for record 50,
# immediately followed by its "### Response:" section.
myinput = format_input(data[50])
response = f"\n\n### Response:\n{data[50]['output']}"
print(myinput, response, sep="")

Below is an instruction that describes a task.Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


# 三、 训练数据集、验证数据集、测试数据集的比例

训练数据集:80%

验证数据集:10%

测试数据集:10%

In [18]:
# Split the records 8:1:1 into train / validation / test, in file order
# (nothing is shuffled here).
train_part = int(len(data) * 0.8)
val_part = int(len(data) * 0.1)
# The test share is whatever the two truncated shares leave over.
test_part = len(data) - (train_part + val_part)

train_data, val_data, test_data = (
    data[:train_part],
    data[train_part:train_part + val_part],
    data[train_part + val_part:],
)

print("train set length:", len(train_data))
print("val set length:", len(val_data))
print("test set length:", len(test_data))

train set length: 880
val set length: 110
test set length: 110


# 四、构建Dataset的数据集


In [19]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Dataset of instruction records that are tokenized once, up front.

    For every record the prompt from ``format_input`` is joined with a
    ``### Response:`` section holding the record's ``"output"``, and the
    combined text is passed through ``tokenizer.encode``.

    Args:
        data: Sequence of records; each needs an ``"output"`` entry plus
            whatever ``format_input`` reads.
        tokenizer: Object exposing ``encode(text)``.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        # Encode eagerly so __getitem__ is a plain list lookup.
        self.samples = [
            tokenizer.encode(
                format_input(record) + f"\n\n### Response:\n{record['output']}"
            )
            for record in data
        ]

    def __getitem__(self, index):
        """Return the encoded sample stored at ``index``."""
        return self.samples[index]

    def __len__(self):
        """Return the number of records the dataset was built from."""
        return len(self.data)

In [20]:
import tiktoken
# Tokenizer used for every dataset in this notebook: tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")
# Print the token ids produced for the end-of-text marker when it is allowed
# as a special token (the recorded output is [50256], the value used as the
# default pad_token_id further down).
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [21]:
import torch;

def my_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_token_id=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate token-id lists into padded input and target tensors.

    Every sample gets one ``pad_token_id`` appended and is then right-padded
    with ``pad_token_id`` to the length of the longest sample in the batch
    plus one. Inputs are the padded sequence without its last token; targets
    are the same sequence shifted left by one. In the targets, every
    occurrence of ``pad_token_id`` after the first is replaced with
    ``ignore_token_id``. Note that this is decided by value, so a
    ``pad_token_id`` that is part of the sample itself counts as an
    occurrence too.

    Args:
        batch: Non-empty sequence of token-id lists.
        pad_token_id: Token id used both as end marker and as padding.
        ignore_token_id: Value written over the surplus padding in targets.
        allowed_max_length: If not ``None``, inputs and targets are truncated
            to at most this many tokens.
        device: Device both returned tensors are moved to (CPU by default).

    Returns:
        Tuple ``(input_tensor, target_tensor)``, two stacked integer tensors
        of identical shape, both on ``device``.

    Raises:
        ValueError: If ``batch`` is empty (raised by ``max``).
    """
    # Longest sample plus one slot, so that even the longest sample keeps a
    # single pad_token_id at the end of its targets.
    batch_max_len = max(len(tokens) + 1 for tokens in batch)

    input_list, target_list = [], []

    for tokens in batch:
        # Append the end marker, then pad up to the common length.
        new_item = tokens + [pad_token_id]
        padded = new_item + [pad_token_id] * (batch_max_len - len(new_item))

        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # targets       = [1, 2, 3, 50256, 50256, 50256, ...]
        # pad_positions = indices of every 50256 in targets
        # squeeze(-1) drops only the column axis, so the result stays 1-D
        # even when there is exactly one match.
        pad_positions = torch.nonzero(targets == pad_token_id).squeeze(-1)

        # Keep the first pad token as a real target; blank out the rest.
        if pad_positions.numel() > 1:
            targets[pad_positions[1:]] = ignore_token_id

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        input_list.append(inputs)
        target_list.append(targets)

    # Both tensors go to the same device so they can be used together
    # without a further transfer.
    input_tensor = torch.stack(input_list).to(device)
    target_tensor = torch.stack(target_list).to(device)

    return input_tensor, target_tensor

In [22]:
# Small demonstration of the index trick used for masking padded targets.
test_mask = [False, True, False, True, True, False, True, True]

# Indices of the True entries, returned as one row per index.
print(torch.tensor(test_mask).nonzero())

# Collapse that single column into a flat 1-D index tensor.
slice = torch.tensor(test_mask).nonzero().squeeze()
print(slice)

targets = torch.tensor([1, 34, 4, 5, 6, 50256, 50256, 50256])
# Overwrite every masked position except the first one with -100.
targets[slice[1:]] = -100
print(targets)


tensor([[1],
        [3],
        [4],
        [6],
        [7]])
tensor([1, 3, 4, 6, 7])
tensor([    1,    34,     4,  -100,  -100, 50256,  -100,  -100])


In [23]:
from functools import partial

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-bind the target device and the truncation length so the result can be
# handed to a DataLoader as its collate_fn.
customized_collate_fn = partial(
    my_collate_fn,
    allowed_max_length=1024,
    device=device,
)
print(device)

cuda


# 五、加载 训练、验证数据集

In [24]:
#  加载数据集
from torch.utils.data import DataLoader

batch_size = 8

# Fix the global torch seed before the loaders are created.
torch.manual_seed(123)

# Training set: shuffled, and an incomplete final batch is dropped.
train_dataset = InstructionDataset(train_data, tokenizer)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=customized_collate_fn,
)

# Validation set: kept in order, every sample is used.
val_dataset = InstructionDataset(val_data, tokenizer)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=customized_collate_fn,
)

# Test set: same settings as the validation set.
test_dataset = InstructionDataset(test_data, tokenizer)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=customized_collate_fn,
)

# 六、加载 OpenAI GPT-2 模型

In [27]:
from GPTModel import MyGPTModel, generate_new, text_to_tokenids, tokenids_to_text
from load_gpt2_model import load_gpt2_weights
import torch

# Hyperparameters handed to both the model constructor and the weight loader.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    max_seq_length=1024,
    embedding_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=True,
)

model = MyGPTModel(GPT_CONFIG_124M)

# Per the recorded log, this fetches gpt2 from Hugging Face and copies its
# weights into `model`.
load_gpt2_weights(model, GPT_CONFIG_124M)
# NOTE(review): presumably switches off dropout for inference; confirm that
# MyGPTModel is an nn.Module.
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"模型已加载至：{device}")



[1/3] 正在从 Hugging Face 下载/加载 gpt2 模型...


'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /openai-community/gpt2/resolve/main/config.json (Caused by ConnectTimeoutError(<HTTPSConnection(host='huggingface.co', port=443) at 0x23440041690>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 83d4a383-61a4-477e-bccb-614d21ccd46c)')' thrown while requesting HEAD https://huggingface.co/openai-community/gpt2/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /openai-community/gpt2/resolve/main/config.json (Caused by ConnectTimeoutError(<HTTPSConnection(host='huggingface.co', port=443) at 0x23440041900>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: fbfc112f-f3a9-4cd1-8f67-7e54cea96a95)')' thrown while requesting HEAD https://huggingface.co/openai-community/gpt2/resolve/main/config.json
Retrying in 2s [Retry 2/5].


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[2/3] 开始权重移植...
  -> 正在加载 Embeddings (wte, wpe)...
  -> 正在加载 12 层 Transformer Block...
  -> 正在加载 Final LayerNorm & Head...
[3/3] 成功！GPT-2 权重已全部加载完成。

模型已加载至：cuda
