# 指令微调

## 下载数据集

In [1]:
import json
import os
import urllib

def download_and_load_file(file_path, url):
    """Return the parsed JSON content of ``file_path``, downloading it first if needed.

    Args:
        file_path: Local path of the cached JSON file.
        url: Location the file is fetched from when ``file_path`` does not exist.

    Returns:
        The object produced by ``json.load`` on the cached file.
    """
    # `import urllib` alone does not load the `request` submodule, so import it
    # explicitly here instead of relying on another library having done so.
    import urllib.request

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # Always read back from disk so that a fresh download returns the data as well
    # (previously the download branch fell through and returned None).
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
# Local cache location for the instruction dataset.
file_path = "instruction-data.json"
# Remote copy used when the cache file does not exist yet.
url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
       "/main/ch07/01_main-chapter-code/instruction-data.json")

# `data` is indexed and sliced like a list of entry dicts in the cells below.
data = download_and_load_file(file_path, url)

In [2]:
# Sanity check: number of entries in the loaded dataset.
print("amount of data: ", len(data))

amount of data:  1100


In [3]:
# Two sample entries: index 50 carries a non-empty 'input', index 999 an empty one.
print(data[50])
print(data[999])

{'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}
{'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


In [4]:
def format_input(entry):
    """Build the prompt text (instruction plus optional input section) for one entry.

    Args:
        entry: Mapping with an ``'instruction'`` key and, optionally, an ``'input'`` key.

    Returns:
        The prompt string. The input section is appended only when ``entry['input']``
        is present and non-empty.
    """
    instruction_text = (
        "以下是一个描述任务的指令。\n"
        "请写出一个适当完成请求的响应\n\n"
        f"### 指令：\n{entry['instruction']}"
    )
    # Entries that lack the 'input' key are treated like entries with an empty input
    # instead of raising KeyError.
    extra_input = entry.get("input")
    # NOTE(review): this header has no space after '###' and uses a half-width colon,
    # unlike the other section headers; it is kept as-is so the prompt format
    # (and anything already trained on it) stays unchanged.
    input_text = f"\n\n###输入:\n{extra_input}" if extra_input else ""
    return instruction_text + input_text

In [5]:
# Preview the full training text (prompt + response section) for an entry with an input.
model_input = format_input(data[50])
desired_response = f"\n\n### 响应：\n{data[50]['output']}"
print(model_input + desired_response)

以下是一个描述任务的指令。
请写出一个适当完成请求的响应

### 指令：
Identify the correct spelling of the following word.

###输入:
Ocassion

### 响应：
The correct spelling is 'Occasion.'


In [6]:
# Same preview for an entry whose 'input' is empty: the input section is omitted.
model_input = format_input(data[999])
desired_response = f"\n\n### 响应：\n{data[999]['output']}"
print(model_input + desired_response)






以下是一个描述任务的指令。
请写出一个适当完成请求的响应

### 指令：
What is an antonym of 'complicated'?

### 响应：
An antonym of 'complicated' is 'simple'.


In [7]:
# Split sizes: 85% train, 10% test, and whatever remains for validation.
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

# Contiguous slices in file order (no shuffling happens before the split).
train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

print(f"train_data: {len(train_data)}")
print(f"test_data: {len(test_data)}")
print(f"val_data: {len(val_data)}")








train_data: 935
test_data: 110
val_data: 55


## 实现指令数据集类

In [8]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Dataset that stores every instruction entry as a pre-tokenized prompt plus response."""

    def __init__(self, data, tokenizer):
        # Raw entries are kept as given; __len__ reports their count.
        self.data = data
        # Each entry is tokenized once, up front: formatted prompt followed by
        # the response section.
        self.encoded_text = [
            tokenizer.encode(format_input(entry) + f"\n\n### 响应：\n{entry['output']}")
            for entry in data
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Token ids of the entry at `index`, exactly as returned by the tokenizer.
        return self.encoded_text[index]

In [9]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
# Named `eot_token_id` rather than `id`, so the builtin `id()` is not shadowed
# for every later cell of the notebook.
eot_token_id = 50256
# Round trip: the id decodes to the end-of-text marker, and the marker encodes
# back to the same single id when it is explicitly allowed as a special token.
print(tokenizer.decode([eot_token_id]))
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

<|endoftext|>
[50256]


In [10]:
def custom_collate_draft_1(batch, pad_token_id=50256, device="cpu"):
    """First collate draft: right-pad every sequence to the longest one in the batch.

    Args:
        batch: List of token-id lists.
        pad_token_id: Id used to fill shorter sequences.
        device: Device the stacked tensor is moved to.

    Returns:
        Tensor with one row per sequence, as wide as the longest sequence.
    """
    # One slot more than the longest sequence; that extra slot is cut off again below.
    padded_length = max(len(seq) + 1 for seq in batch)

    rows = []
    for seq in batch:
        filler = [pad_token_id] * (padded_length - len(seq))
        # Work on a copy so the caller's list is left untouched.
        rows.append(torch.tensor((seq.copy() + filler)[:-1]))

    return torch.stack(rows).to(device)

In [11]:
# Toy batch with three sequences of different lengths; the later collate demos reuse it.
inputs_1 = [0,1,2,3,4]
inputs_2 = [5,6]
inputs_3 = [7,8,9]
batch = [inputs_1, inputs_2, inputs_3]
print(custom_collate_draft_1(batch))
    

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


In [12]:
def custom_collate_draft_2(batch, pad_token_id=50256, device="cpu"):
    """Second collate draft: build padded inputs together with shifted targets.

    Args:
        batch: List of token-id lists.
        pad_token_id: Id appended as end marker and used to pad the inputs.
        device: Device both tensors are created on.

    Returns:
        ``(inputs, targets)``. Targets are the inputs shifted left by one with one
        ``pad_token_id`` appended; target padding beyond that uses -100.
    """
    row_length = max(len(seq) + 1 for seq in batch) - 1

    input_rows, target_rows = [], []
    for seq in batch:
        # Copy first so the caller's list is not modified.
        extended = seq.copy() + [pad_token_id]
        model_in, model_out = extended[:-1], extended[1:]
        # Inputs are filled with the pad id, targets with -100.
        model_in += [pad_token_id] * (row_length - len(model_in))
        model_out += [-100] * (row_length - len(model_out))
        input_rows.append(model_in)
        target_rows.append(model_out)

    return (
        torch.tensor(input_rows, device=device),
        torch.tensor(target_rows, device=device),
    )

In [13]:
# Same toy batch: second draft returns an (inputs, targets) pair.
print(custom_collate_draft_2(batch))

(tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]]), tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]]))


In [14]:
def custom_collate_fn(batch, pad_token_id=50256,ignore_index=-100,allowed_max_length=None,device="cpu"):
    """Collate token-id sequences into padded input/target tensors for next-token training.

    Every sequence gets one end marker (``pad_token_id``) appended and is then
    right-padded with ``pad_token_id`` to a common length. Inputs are the padded
    sequence without its last token; targets are the same sequence shifted left by one.
    In the targets the appended end marker is kept and all padding after it is
    replaced by ``ignore_index``.

    Args:
        batch: Non-empty list of token-id sequences.
        pad_token_id: Id used both as end marker and as padding.
        ignore_index: Value written into target positions that are pure padding.
        allowed_max_length: If given, inputs and targets are cut to this many tokens.
        device: Device the stacked tensors are moved to.

    Returns:
        ``(inputs_tensor, targets_tensor)``, each with one row per sequence.

    Raises:
        ValueError: If ``batch`` is empty (raised by ``max``).
    """
    batch_max_length = max(len(item) + 1 for item in batch)
    inputs_lst, targets_lst = [], []
    for item in batch:
        # Copy so the caller's sequence is never modified.
        tokens = list(item)
        padded = tokens + [pad_token_id] * (batch_max_length - len(tokens))
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # Mask by position, not by value: the end marker sits at target index
        # len(tokens) - 1 and everything after it is padding. Searching for
        # `pad_token_id` by value instead would mask the real end marker whenever
        # that id also occurs inside the sequence itself.
        # max(..., 1) keeps one pad target for an empty sequence.
        first_padding = max(len(tokens), 1)
        targets[first_padding:] = ignore_index

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]
        inputs_lst.append(inputs)
        targets_lst.append(targets)

    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor





In [15]:
# Final collate function on the toy batch: padded inputs and masked, shifted targets.
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)





tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [16]:
# Three 2-class predictions; the third target is -100, the value the collate
# function writes into padded target positions.
logits_1 = torch.tensor([[-1.0,1.0],[-0.5,1.5],[1.5,-1]])
targets_1 = torch.tensor([0,1,-100])
# The printed loss (1.1269) is the mean over the first two rows only, i.e. the
# -100 row does not contribute.
loss_1 = torch.nn.functional.cross_entropy(logits_1, targets_1)
print(loss_1)

tensor(1.1269)


In [17]:
# Pick the best available backend instead of hard-coding "mps", so the notebook
# also runs on machines without Apple-silicon GPU support (tensors could not be
# moved to a hard-coded "mps" device there).
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

mps


In [18]:
from functools import partial
# Pre-bind device and maximum length so the DataLoader can call the collate
# function with the batch alone. 1024 equals the "context_length" in BASE_CONFIG.
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)

In [19]:
from torch.utils.data import DataLoader
# num_workers=0 keeps batch loading in the main process; the collate function
# already moves each batch to `device`.
num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Training split: shuffled, and a trailing incomplete batch is dropped.
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
)

# Validation split: fixed order, every entry kept.
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
)

# Test split: fixed order, every entry kept.
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
)

In [20]:
# One pass over the training loader: inputs and targets share a shape within a
# batch, while the sequence length differs from batch to batch.
for inputs, targets in train_loader:
    print(inputs.shape)
    print(targets.shape)



torch.Size([8, 112])
torch.Size([8, 112])
torch.Size([8, 123])
torch.Size([8, 123])
torch.Size([8, 124])
torch.Size([8, 124])
torch.Size([8, 119])
torch.Size([8, 119])
torch.Size([8, 116])
torch.Size([8, 116])
torch.Size([8, 119])
torch.Size([8, 119])
torch.Size([8, 131])
torch.Size([8, 131])
torch.Size([8, 118])
torch.Size([8, 118])
torch.Size([8, 113])
torch.Size([8, 113])
torch.Size([8, 126])
torch.Size([8, 126])
torch.Size([8, 109])
torch.Size([8, 109])
torch.Size([8, 119])
torch.Size([8, 119])
torch.Size([8, 118])
torch.Size([8, 118])
torch.Size([8, 124])
torch.Size([8, 124])
torch.Size([8, 116])
torch.Size([8, 116])
torch.Size([8, 126])
torch.Size([8, 126])
torch.Size([8, 118])
torch.Size([8, 118])
torch.Size([8, 117])
torch.Size([8, 117])
torch.Size([8, 130])
torch.Size([8, 130])
torch.Size([8, 119])
torch.Size([8, 119])
torch.Size([8, 127])
torch.Size([8, 127])
torch.Size([8, 119])
torch.Size([8, 119])
torch.Size([8, 120])
torch.Size([8, 120])
torch.Size([8, 116])
torch.Size([8, 116])

In [22]:
from gpt_download import download_and_load_gpt2

In [23]:
import torch
import torch.nn as nn

class GPTModel(nn.Module):
    """Decoder model: token + position embeddings, a stack of transformer blocks,
    a final normalization and a bias-free projection to vocabulary logits."""

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)
        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape (batch, seq_len) to logits over the vocabulary."""
        _, seq_len = in_idx.shape
        # Position indices are created on the same device as the token ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)
class TransformerBlock(nn.Module):
    """Transformer block with two residual sub-layers (attention, then feed-forward),
    each normalized on its input and followed by dropout."""

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        # One dropout module is shared by both residual branches.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Attention sub-layer: norm -> attention -> dropout, added back onto the input.
        x = self.drop_shortcut(self.att(self.norm1(x))) + x
        # Feed-forward sub-layer, same pattern.
        x = self.drop_shortcut(self.ff(self.norm2(x))) + x
        return x

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with an output projection."""

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Linear layer applied to the concatenated head outputs (d_out -> d_out).
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Upper-triangular ones above the diagonal mark the "future" positions.
        future_positions = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', future_positions)

    def forward(self, x):
        # x: (batch, number of tokens, input dimension)
        batch, seq_len, _ = x.shape

        def split_heads(t):
            # (batch, seq_len, d_out) -> (batch, num_heads, seq_len, head_dim)
            return t.view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        k = split_heads(self.W_key(x))
        q = split_heads(self.W_query(x))
        v = split_heads(self.W_value(x))

        scores = q @ k.transpose(2, 3)
        # Future positions get -inf so they receive zero weight after the softmax.
        causal = self.mask.bool()[:seq_len, :seq_len]
        scores.masked_fill_(causal, -torch.inf)

        weights = torch.softmax(scores / k.shape[-1] ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # Merge the heads back into one d_out-wide vector per token.
        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, self.d_out)
        return self.out_proj(merged)
class LayerNorm(nn.Module):
    """Normalization over the last dimension with a learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # keeps the denominator non-zero when the variance is 0
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mu = x.mean(dim=-1, keepdim=True)
        # unbiased=False: divide by N rather than N - 1.
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        std = torch.sqrt(sigma_sq + self.eps)
        normalized = (x - mu) / std
        return self.scale * normalized + self.shift

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, apply GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        project = nn.Linear(hidden_dim, emb_dim)
        # Indices 0 and 2 of this Sequential are the two linear layers.
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        return self.layers(x)

class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

In [24]:
def load_weights_safely(model, params):
    """Load the weights in ``params`` into ``model``; errors are reported, not raised.

    Relies on an ``assign(current, new)`` helper and on ``np`` being available in
    the notebook namespace; neither is defined in this cell.

    Args:
        model: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Dict with the keys ``"wpe"``, ``"wte"``, ``"blocks"``, ``"g"`` and ``"b"``.

    Returns:
        True when every assignment succeeded. False when any exception occurred;
        the error and its traceback are printed, and assignments made before the
        failure stay in place.
    """
    try:
        # 1. Positional and token embeddings.
        print("加载基础嵌入权重...")
        model.pos_emb.weight = assign(model.pos_emb.weight, params["wpe"])
        model.tok_emb.weight = assign(model.tok_emb.weight, params["wte"])
        print("✓ 基础嵌入权重加载成功")

        # 2. Per-block weights.
        n_blocks = len(params["blocks"])
        print(f"加载 {n_blocks} 个transformer块的权重...")
        for b in range(n_blocks):
            src = params["blocks"][b]

            # Attention weights: the fused "c_attn" matrix is split into three
            # equal parts (query, key, value) along its last axis.
            q_w, k_w, v_w = np.split(src["attn"]["c_attn"]["w"], 3, axis=-1)
            att = model.trf_blocks[b].att
            att.W_query.weight = assign(att.W_query.weight, q_w.T)
            att.W_key.weight = assign(att.W_key.weight, k_w.T)
            att.W_value.weight = assign(att.W_value.weight, v_w.T)

            # Attention biases, split the same way.
            q_b, k_b, v_b = np.split(src["attn"]["c_attn"]["b"], 3, axis=-1)
            att.W_query.bias = assign(att.W_query.bias, q_b)
            att.W_key.bias = assign(att.W_key.bias, k_b)
            att.W_value.bias = assign(att.W_value.bias, v_b)

            # Attention output projection.
            att.out_proj.weight = assign(att.out_proj.weight, src["attn"]["c_proj"]["w"].T)
            att.out_proj.bias = assign(att.out_proj.bias, src["attn"]["c_proj"]["b"])

            # Feed-forward network: layers[0] and layers[2] are the two linear layers.
            ff_layers = model.trf_blocks[b].ff.layers
            ff_layers[0].weight = assign(ff_layers[0].weight, src["mlp"]["c_fc"]["w"].T)
            ff_layers[0].bias = assign(ff_layers[0].bias, src["mlp"]["c_fc"]["b"])
            ff_layers[2].weight = assign(ff_layers[2].weight, src["mlp"]["c_proj"]["w"].T)
            ff_layers[2].bias = assign(ff_layers[2].bias, src["mlp"]["c_proj"]["b"])

            # Layer-norm scale and shift.
            norm1, norm2 = model.trf_blocks[b].norm1, model.trf_blocks[b].norm2
            norm1.scale = assign(norm1.scale, src["ln_1"]["g"])
            norm1.shift = assign(norm1.shift, src["ln_1"]["b"])
            norm2.scale = assign(norm2.scale, src["ln_2"]["g"])
            norm2.shift = assign(norm2.shift, src["ln_2"]["b"])

            # Progress line after every fourth block.
            done = b + 1
            if done % 4 == 0:
                print(f"✓ 完成块 {done}/{n_blocks}")

        # 3. Final norm and output head; the head receives the token-embedding
        # matrix ("wte").
        print("加载最终层权重...")
        model.final_norm.scale = assign(model.final_norm.scale, params["g"])
        model.final_norm.shift = assign(model.final_norm.shift, params["b"])
        model.out_head.weight = assign(model.out_head.weight, params["wte"])
        print("✓ 最终层权重加载成功")

        print("🎉 所有权重加载完成！")
        return True

    except Exception as e:
        print(f"❌ 权重加载失败: {e}")
        import traceback
        traceback.print_exc()
        return False

In [None]:
# Settings shared by every model size; the size-specific entries
# (emb_dim, n_heads, n_layers) are merged in afterwards via BASE_CONFIG.update(...).
BASE_CONFIG = {
    "vocab_size": 50257,     # rows of the token-embedding table / width of the output head
    "context_length": 1024,  # rows of the positional-embedding table and size of the causal mask
    "drop_rate": 0.0,        # dropout probability used throughout the model
    "qkv_bias": True,        # give the query/key/value projections a bias term
}

# Architecture settings per model size.
# NOTE(review): presumably download_and_load_gpt2 only accepts the official
# checkpoint sizes; the "760M" / "1300M" labels may need to be "774M" / "1558M".
# Confirm against gpt_download before selecting those entries.
model_configs = {
    "gpt2-small(124M)": {"emb_dim": 768, "n_heads": 12, "n_layers": 12},
    "gpt2-medium(355M)": {"emb_dim": 1024, "n_heads": 16, "n_layers": 24},
    "gpt2-large(760M)": {"emb_dim": 1280, "n_heads": 20, "n_layers": 36},
    "gpt2-xl(1300M)": {"emb_dim": 1600, "n_heads": 25, "n_layers": 48}
}
CHOOSE_MODEL = "gpt2-medium(355M)"
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# Take the text between the parentheses (e.g. "355M"). Splitting on a space first
# only works for labels written as "name (size)"; for "gpt2-medium(355M)" it
# produced "gpt2-medium(355M" instead of "355M".
model_size = CHOOSE_MODEL.split("(")[-1].rstrip(")")

settings, params = download_and_load_gpt2(model_size, models_dir="gpt2")

