In [1]:
import json
import os
import urllib
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken
from functools import partial
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.nn import functional as F
import time
import tqdm

ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

### 下载数据集

In [77]:
def download_and_load_file(file_path, url):
    """Return the JSON document stored at ``file_path``, downloading it first if needed.

    When ``file_path`` does not exist, the body of ``url`` is fetched, decoded
    as UTF-8 and written to ``file_path`` (missing parent directories are
    created). Otherwise the existing file is read as UTF-8. In both cases the
    text already in memory is parsed, so the file is never read twice.

    Args:
        file_path: Location of the local copy of the JSON file.
        url: Address to download from when the local copy is missing.

    Returns:
        The object obtained by parsing the JSON text.
    """
    # `import urllib` on its own does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            # Writing would fail if the target directory has not been created yet.
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()
    # Parse the text we already hold instead of reopening the file with the
    # platform-default encoding.
    return json.loads(text_data)

# Local cache path and upstream location of the instruction dataset.
file_path = "../dataset/instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    + "/main/ch07/01_main-chapter-code/instruction-data.json"
)

# Fetch the file on first use, then report how many records it holds.
data = download_and_load_file(file_path, url)
num_entries = len(data)
print("Number of entries:", num_entries)

Number of entries: 1100


In [78]:
# Inspect a single record to see which fields an entry carries.
example_entry = data[99]
print("Example entry:\n", example_entry)

Example entry:
 {'instruction': "Provide a synonym for 'kind'.", 'input': '', 'output': "A synonym for 'kind' is 'benevolent'."}


### 提示词风格

#### 将指令集中的数据转换为两种提示词风格，Alpaca和Phi-3

In [79]:
def format_input(entry):
    """Render the prompt part of an instruction record.

    The prompt starts with a fixed preamble, followed by an
    ``### Instruction:`` section. An ``### Input:`` section is appended only
    when ``entry["input"]`` is truthy.

    Args:
        entry: Mapping with at least the keys ``"instruction"`` and ``"input"``.

    Returns:
        The assembled prompt string (without any response section).
    """
    sections = [
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.",
        f"\n\n### Instruction:\n{entry['instruction']}",
    ]
    if entry["input"]:
        sections.append(f"\n\n### Input:\n{entry['input']}")
    return "".join(sections)

### 划分数据集

In [80]:
# Two-stage split: 85% of the records go to training; of the remaining
# records, the 33% portion becomes the validation set and the rest the test set.
train_data, temp_data = train_test_split(data, train_size=0.85, random_state=42)
test_data, val_data = train_test_split(temp_data, test_size=0.33, random_state=42)

# Report the size of every split.
for split_name, split_records in (
    ("训练集", train_data),
    ("验证集", val_data),
    ("测试集", test_data),
):
    print(f"{split_name}: {len(split_records)}")

训练集: 935
验证集: 55
测试集: 110


#### dataset与dataloader

In [81]:
class InstructionDataset(Dataset):
    """Instruction/response records tokenized once, at construction time.

    Every record is rendered as the prompt produced by ``format_input``
    followed by a ``### Response:`` section holding the record's ``output``
    field; the resulting text is passed to ``tokenizer.encode`` and the result
    is stored in ``encoded_text``.

    Args:
        data: Iterable of records with an ``"output"`` key plus whatever
            ``format_input`` reads.
        tokenizer: Object exposing ``encode(text)``.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_text = [
            tokenizer.encode(
                format_input(record) + f"\n\n### Response:\n{record['output']}"
            )
            for record in data
        ]

    def __len__(self):
        """Number of records the dataset was built from."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded text stored at position ``index``."""
        return self.encoded_text[index]

In [82]:
def custom_collate_fn(batch, pad_token=50256, ignore_token=-100, allowed_max_length=None,device="cpu"):
    """Pad a batch of token-id sequences and build next-token targets.

    Each sequence gets one ``pad_token`` appended as an end marker and is then
    padded with ``pad_token`` to the longest sequence in the batch. Inputs are
    the padded sequence without its last element; targets are the same
    sequence shifted left by one. In the targets, the end marker is kept and
    every padding position after it is replaced by ``ignore_token``.

    Padding is located by position rather than by value, so a genuine token
    whose id happens to equal ``pad_token`` is never masked.

    Args:
        batch: Non-empty sequence of token-id sequences (lists or tuples).
        pad_token: Id used both as end marker and as padding.
        ignore_token: Value written into target positions that are padding.
        allowed_max_length: If not ``None``, inputs and targets are cut to at
            most this many positions.
        device: Device on which the returned tensors are created.

    Returns:
        Tuple ``(input_tensor, target_tensor)`` of ``torch.long`` tensors with
        one row per sequence.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("custom_collate_fn received an empty batch")

    # +1 leaves room for the end marker appended to every sequence.
    batch_max_length = max(len(item) for item in batch) + 1
    input_rows, target_rows = [], []

    for item in batch:
        tokens = list(item)  # copy; also accepts tuples
        seq_len = len(tokens)
        padded = tokens + [pad_token] * (batch_max_length - seq_len)
        inputs = padded[:-1]
        targets = padded[1:]

        # targets[seq_len - 1] is the end marker; everything after it is
        # padding. An empty sequence still keeps its first target position.
        first_ignored = max(seq_len, 1)
        targets[first_ignored:] = [ignore_token] * (len(targets) - first_ignored)

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]
        input_rows.append(inputs)
        target_rows.append(targets)

    # All rows share one length, so each side becomes a single tensor
    # created directly on the requested device.
    input_tensor = torch.tensor(input_rows, dtype=torch.long, device=device)
    target_tensor = torch.tensor(target_rows, dtype=torch.long, device=device)
    return input_tensor, target_tensor

In [83]:
# Toy batch of three token-id lists with different lengths, used to
# eyeball the padding and target masking done by the collate function.
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (inputs_1, inputs_2, inputs_3)

print(custom_collate_fn(batch))

(tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]]), tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]]))


In [84]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Collate function with the target device and the length cap bound in advance.
collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)

In [85]:
# Loader settings and the tokenizer used to encode every split.
num_works = 0
batch_size = 8
tokenizer = tiktoken.get_encoding("gpt2")
torch.manual_seed(123)

# Tokenize each split once, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    InstructionDataset(split_records, tokenizer)
    for split_records in (train_data, val_data, test_data)
)

# Arguments common to all three loaders.
loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=collate_fn,
    num_workers=num_works,
)

# Only the training loader shuffles and drops the final incomplete batch.
train_loader = DataLoader(train_dataset, shuffle=True, drop_last=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, drop_last=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, drop_last=False, **loader_kwargs)

In [None]:
# Load the pretrained "gpt2" checkpoint together with its Hugging Face tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2")
# NOTE(review): this rebinds `tokenizer`, which an earlier cell set to the
# tiktoken "gpt2" encoding; any cell re-run after this one sees the Hugging Face
# tokenizer instead. Consider a distinct name once downstream usage is confirmed.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Same device selection as the cell that builds `collate_fn`: CUDA when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device; as the last expression, this is also the cell's output.
model.to(device)

In [None]:
def train_one_epoch(model, train_loader, optimizer, criterion):
    """Run one optimization pass over ``train_loader`` and return the mean batch loss.

    Two batch layouts are accepted:

    * ``(inputs, targets)`` pairs, as produced by ``custom_collate_fn``. The
      targets are already shifted by one position, so the loss is computed
      directly on the model's logits. Passing them as ``labels=`` to a Hugging
      Face causal LM would shift them a second time.
    * Mappings with ``'input_ids'`` and ``'labels'`` keys. These are forwarded
      as ``model(inputs, labels=labels)`` and the loss reported by the model is
      used; ``criterion`` is not consulted for this layout.

    Args:
        model: Module whose call result exposes ``.logits`` (and ``.loss`` when
            called with ``labels``).
        train_loader: Iterable of batches in one of the layouts above.
        optimizer: Optimizer holding the model's parameters.
        criterion: Callable ``criterion(logits, targets)`` applied to logits
            flattened to ``(N, vocab)`` and targets flattened to ``(N,)``. When
            ``None``, cross-entropy with ``ignore_index=-100`` is used.

    Returns:
        Mean of the per-batch losses as a float, or ``0.0`` when the loader
        yields no batches.
    """
    from collections.abc import Mapping

    # `import tqdm` binds the module, which is not callable; the progress bar
    # is the `tqdm.tqdm` class. The bar is cosmetic, so degrade to plain
    # iteration when the package is unavailable.
    try:
        from tqdm import tqdm as progress_bar
    except ImportError:
        def progress_bar(iterable):
            return iterable

    model.train()
    # Follow the model's own placement instead of relying on a global `device`.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    num_batches = 0
    for batch in progress_bar(train_loader):
        optimizer.zero_grad()

        if isinstance(batch, Mapping):
            inputs = batch['input_ids'].to(model_device)
            labels = batch['labels'].to(model_device)
            loss = model(inputs, labels=labels).loss
        else:
            inputs, labels = batch
            inputs = inputs.to(model_device)
            labels = labels.to(model_device)
            logits = model(inputs).logits
            flat_logits = logits.flatten(0, 1)
            flat_labels = labels.flatten()
            if criterion is None:
                loss = F.cross_entropy(flat_logits, flat_labels, ignore_index=-100)
            else:
                loss = criterion(flat_logits, flat_labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Counting batches avoids both a division by zero on an empty loader and
    # the need for the loader to define `len()`.
    return total_loss / num_batches if num_batches else 0.0