In [None]:
import json
import os
import urllib
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken
from functools import partial
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.nn import functional as F
import time

### 下载数据集

In [77]:
def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as respons:
            text_data = respons.read().decode("utf-8")
        with open (file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()
    with open(file_path, "r") as file:
        data = json.load(file)
    return data

file_path = "../dataset/instruction-data.json" 
url = ( 
 "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch" 
 "/main/ch07/01_main-chapter-code/instruction-data.json" 
) 
data = download_and_load_file(file_path, url) 
print("Number of entries:", len(data))

Number of entries: 1100


In [78]:
print("Example entry:\n", data[99])

Example entry:
 {'instruction': "Provide a synonym for 'kind'.", 'input': '', 'output': "A synonym for 'kind' is 'benevolent'."}


### 提示词风格

#### 将指令集中的数据转换为两种提示词风格，Alpac和Phi-3

In [79]:
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request." 
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    input_text = (
        f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
    )
    return instruction_text + input_text

### 划分数据集

In [80]:
train_data, temp_data = train_test_split(data, train_size=0.85, random_state=42)
test_data, val_data = train_test_split(temp_data, test_size=0.33, random_state=42)

print(f"训练集: {len(train_data)}")
print(f"验证集: {len(val_data)}")
print(f"测试集: {len(test_data)}")

训练集: 935
验证集: 55
测试集: 110


#### dataset与dataloader

In [81]:
class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_text = []
        for entry in data:
            instruction_inputs = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_inputs + response_text
            self.encoded_text.append(tokenizer.encode(full_text))
    def __getitem__(self, index):
        return self.encoded_text[index]
    def __len__(self):
        return len(self.data)

In [82]:
def custom_collate_fn(batch, pad_token=50256, ignore_token=-100, allowed_max_length=None,device="cpu"):
    batch_max_lemgth = max(len(item) + 1 for item in batch)
    input_lst, target_lst = [], []

    for item in batch:
        new_item = item.copy()
        new_item += [pad_token]
        padded = (new_item + [pad_token] * (batch_max_lemgth - len(new_item)))
        inputs = padded[:-1]
        targets = padded[1:]

        inputs = torch.tensor(inputs, dtype=torch.long, device=device)
        targets = torch.tensor(targets, dtype=torch.long, device=device)
        mask = targets == pad_token
        indices = torch.nonzero(mask).squeeze() 
        if indices.numel() > 1: 
            targets[indices[1:]] = ignore_token
        
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]
        input_lst.append(inputs)
        target_lst.append(targets)
    
    input_tensor = torch.stack(input_lst).to(device)
    target_tensor = torch.stack(target_lst).to(device)
    return input_tensor, target_tensor

In [83]:
inputs_1 = [0, 1, 2, 3, 4] 
inputs_2 = [5, 6] 
inputs_3 = [7, 8, 9] 
batch = ( 
 inputs_1, 
 inputs_2, 
 inputs_3 
) 
print(custom_collate_fn(batch))

(tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]]), tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]]))


In [84]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
collate_fn = partial(
    custom_collate_fn,
    device=device,
    allowed_max_length=1024
)

In [85]:
num_works = 0
batch_size = 8
tokenizer = tiktoken.get_encoding("gpt2")
torch.manual_seed(123)

train_dataset = InstructionDataset(train_data, tokenizer)
val_dataset = InstructionDataset(val_data, tokenizer)
test_dataset = InstructionDataset(test_data, tokenizer)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_works
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_works
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_works
)

In [None]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [88]:
torch.manual_seed(123) 
input_text = format_input(val_data[0]) 
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Rewrite this statement as an imperative sentence.

### Input:
You should finish your assignment.


In [89]:
model.eval()
inputs_id = tokenizer.encode(input_text)
inputs_tensor = torch.tensor(inputs_id).unsqueeze(0)
with torch.no_grad():
    output = model.generate(
        inputs_tensor,
        max_length = 60,
        num_return_sequences=1
    )
generated_ids = output[0].tolist()
generated_text = tokenizer.decode(generated_ids)

num_input_ids = len(inputs_id)
response_text = generated_text[len(input_text):].strip()
print(response_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### Output:

You should complete your assignment.


In [None]:
def calc_loss_batch(inputs, targets, model, device):
    inputs = inputs.to(device)
    targets = targets.to(device)

    model.train() 
    outputs = model(inputs, labels=targets)

    # 获取损失
    loss = outputs.loss
    return loss  # 返回张量

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.0
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (inputs, targets) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(inputs, targets, model, device)
            total_loss += loss
        else:
            break
    return total_loss / num_batches

In [113]:
model.to(device) 
torch.manual_seed(123) 
with torch.no_grad(): 
    train_loss = calc_loss_loader( 
        train_loader, model, device,num_batches=5
        ) 
    val_loss = calc_loss_loader( 
        val_loader, model, device, num_batches=5
        ) 
print("Training loss:", train_loss) 
print("Validation loss:", val_loss)

Training loss: tensor(262.3800)
Validation loss: tensor(274.9545)


In [101]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, 
                                      num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, 
                                      num_batches=eval_iter)
        model.train()
        return train_loss, val_loss

In [102]:
def generate_text_simple(model, idx, max_text, context_size):
    for _ in range(max_text):
        idx = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx)
        logits = logits[:, -1, :]
        idx_text = torch.softmax(logits, dim = -1)
        idx_next = torch.argmax(idx_text, dim = -1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim = -1)
    return idx

In [103]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_embed.weight.shape[0]

    inputs_id = tokenizer.encode(start_context)
    encoded = torch.tensor(inputs_id).unsqueeze(0).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model = model, idx = encoded,
            max_text = 50, context_size = context_size
        )

    generated_id = output[0].tolist()
    decoded_text = tokenizer.decode(generated_id)
    print(decoded_text.replace("\n", " ")) 
    model.train()

In [104]:
def generate_and_print(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.config.n_positions

    inputs_id = tokenizer.encode(start_context)
    inputs_tensor = torch.tensor(inputs_id).unsqueeze(0).to(device)
    if inputs_tensor.shape[1] > context_size:
        inputs_tensor = inputs_tensor[:, :-context_size]

    max_length = context_size + 50
    with torch.no_grad():
        output = model.generate(inputs_tensor, max_length=max_length, num_return_sequences=1)
    generated_id = output[0].tolist()
    generate_text = tokenizer.decode(generated_id)
    
    response_text = generate_text[len(input_text):].strip()
    print(response_text)

In [105]:
def train(model, train_loader, val_loader, optimizer, device, 
          num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_token_seen = [], [], []
    token_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        model.to(device)
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(inputs, targets, model, device)
            loss.backward()
            optimizer.step()
            token_seen += inputs.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                    )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_token_seen.append(token_seen)

                print(f"Ep {epoch+1} (Step {global_step:06d}): " 
                      f"Train loss {train_loss:.3f}, " 
                      f"Val loss {val_loss:.3f}" 
                      )
        generate_and_print_sample(model, tokenizer, device, start_context)
    return train_losses, val_losses, track_token_seen

In [112]:
start_time = time.time() 
torch.manual_seed(123) 
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
num_epochs = 2 

train_losses, val_losses, tokens_seen = train(
    model, train_loader, val_loader, optimizer, device, 
    num_epochs=num_epochs, eval_freq=5, eval_iter=5, 
    start_context=format_input(val_data[0]), tokenizer=tokenizer) 

end_time = time.time() 
execution_time_minutes = (end_time - start_time) / 60 
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 290.605, Val loss 291.868
Ep 1 (Step 000005): Train loss 293.196, Val loss 291.232
Ep 1 (Step 000010): Train loss 295.004, Val loss 290.369
Ep 1 (Step 000015): Train loss 291.270, Val loss 289.461
Ep 1 (Step 000020): Train loss 285.863, Val loss 288.645
Ep 1 (Step 000025): Train loss 289.004, Val loss 287.579
Ep 1 (Step 000030): Train loss 288.944, Val loss 286.971
Ep 1 (Step 000035): Train loss 286.676, Val loss 286.135
Ep 1 (Step 000040): Train loss 285.916, Val loss 285.258
Ep 1 (Step 000045): Train loss 289.763, Val loss 284.535
Ep 1 (Step 000050): Train loss 275.644, Val loss 283.787
Ep 1 (Step 000055): Train loss 295.062, Val loss 282.968
Ep 1 (Step 000060): Train loss 282.692, Val loss 282.200
Ep 1 (Step 000065): Train loss 279.132, Val loss 280.997
Ep 1 (Step 000070): Train loss 287.079, Val loss 280.205
Ep 1 (Step 000075): Train loss 282.901, Val loss 279.452
Ep 1 (Step 000080): Train loss 285.635, Val loss 278.762
Ep 1 (Step 000085): Train loss 

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

def plot_values(epochs_seen, examples_seen, train_values, val_values, 
    label="loss"): 
    fig, ax1 = plt.subplots(figsize=(5, 3))
    ax1.plot(epochs_seen, train_values, label=f"Training {label}") 
    ax1.plot( epochs_seen, val_values, linestyle="-.", label=f"Validation {label}") 
    ax1.set_xlabel("Epochs") 
    ax1.set_ylabel(label.capitalize()) 
    ax1.legend() 
    ax2 = ax1.twiny() 
    ax2.plot(examples_seen, train_values, alpha=0) 
    ax2.set_xlabel("Examples seen") 
    fig.tight_layout() 
    plt.savefig(f"{label}-plot.pdf") 
    plt.show()
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses)) 
examples_seen_tensor = torch.linspace(0, example_seen, len(train_losses)) 
plot_values(epochs_tensor, examples_seen_tensor, train_losses, val_losses)

NameError: name 'example_seen' is not defined

In [None]:
def generate(model, max_new_length):
    model.eval()
    inputs_id = tokenizer.encode(input_text)
    inputs_tensor = torch.tensor(inputs_id).unsqueeze(0)
    with torch.no_grad():
        output = model.generate(
            inputs_tensor,
            max_length = 60,
            num_return_sequences=1
        )
    generated_ids = output[0].tolist()
    generated_text = tokenizer.decode(generated_ids)

    response_text = generated_text[len(input_text):].strip()
    return response_text