<hr>

本notebook实现从0到1构建自己的大模型

# Setup Environment
安装和导入必要的包

In [None]:

!pip install --upgrade --quiet tiktoken

In [2]:
# 导入包
import numpy as np
import torch 
from torch.utils.data import Dataset, DataLoader
from torch import nn 
import tiktoken as ttk

from tqdm import tqdm
import os
import urllib.request
from dataclasses import dataclass, field,asdict, replace

from typing import Dict

In [3]:
# Fetch the raw text from the repository (it still needs further processing
# before it is in a form the model can consume).
# file_path is bound unconditionally so it exists even when the download is skipped,
# and the existence check and the download target can never drift apart.
file_path = "the-verdict.txt"
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

In [4]:
@dataclass
class GPTConfig:
    """Hyperparameters consumed by the GPT model classes in this notebook.

    Attributes:
        vocab_size: Number of rows in the token embedding table.
        context_length: Number of rows in the positional embedding table and
            side length of the causal attention mask.
        emb_dim: Embedding width used throughout the model.
        n_heads: Number of attention heads per transformer block.
        n_layers: Number of stacked transformer blocks.
        drop_rate: Dropout probability.
        qkv_bias: Whether the query/key/value projections carry a bias term.
    """
    vocab_size: int = 50257
    context_length: int = 512
    emb_dim: int = 768
    n_heads: int = 12
    n_layers: int = 12
    drop_rate: float = 0.1
    qkv_bias: bool = False

    def __repr__(self) -> str:
        """Render the configuration as a ``GPT_CONFIG_124M = {...}`` dict-style listing."""
        entries = ",\n    ".join(
            f'"{name}": {value!r}' for name, value in self.to_dict().items()
        )
        return "GPT_CONFIG_124M = {\n    " + entries + "\n}"

    def to_dict(self) -> dict:
        """Return the fields as a plain dictionary (field name -> value)."""
        return asdict(self)

@dataclass
class DataConfig:
    """Settings for reading the corpus and slicing it into fixed-length samples.

    Attributes:
        dataPath: Location of the raw text file.
        max_length: Number of tokens per sample; defaults to the model context length.
        batch_size: Number of samples per batch.
        train_ratio: Fraction of the raw text used for training; the rest is validation.
        stride: Step between the starts of consecutive samples; defaults to the
            context length, so consecutive windows do not overlap.
    """
    # Relative path, identical to the file name the download step writes into the
    # working directory. The previous absolute '/kaggle/working/...' path only
    # resolved to that same file when the working directory was /kaggle/working;
    # anywhere else read_txt() reported "file not found" and returned an empty string.
    dataPath: str = "the-verdict.txt"
    max_length: int = GPTConfig.context_length
    batch_size: int = 64
    train_ratio: float = 0.90
    stride: int = GPTConfig.context_length
#     def __post_init__(self):
#         self.stride = self.max_length // 2
# NOTE: the names DataConfig and GPTConfig are rebound here from the dataclasses to
# default-constructed instances; every later use in the notebook refers to the
# instances, not the classes.
DataConfig =DataConfig()
GPTConfig=GPTConfig()
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device =  torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
def read_txt(path: str) -> str:
    """Read a UTF-8 text file and return its full contents.

    This is a best-effort reader: failures are reported on stdout instead of
    being raised.

    Args:
        path: Location of the file to read.

    Returns:
        The file contents, or an empty string if the file is missing or any
        other error occurs while opening/reading it.
    """
    contents = ""
    try:
        with open(path, mode="r", encoding="utf-8") as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"Error: File not found at {path}")
    except Exception as err:
        print(f"An error occurred: {err}")
    return contents

def text_to_token_ids(text, tokenizer):
    """Map text to token ids.

    Args:
        text: String to encode.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence of ids.

    Returns:
        A tensor of the encoded ids with a leading batch axis of size 1.
    """
    ids = tokenizer.encode(text)
    # Prepend the batch dimension.
    return torch.unsqueeze(torch.tensor(ids), dim=0)


def token_ids_to_text(token_ids, tokenizer):
    """Decode token ids back to text.

    Args:
        token_ids: Tensor of ids; a leading axis of size 1 is dropped before decoding.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the id list.
    """
    # Remove the batch dimension (only has an effect when that axis has size 1).
    ids = torch.squeeze(token_ids, dim=0).tolist()
    return tokenizer.decode(ids)

大模型处理流程：数据 → tokenizer → 模型 → post-processing → 输出
第一步：数据处理：分词（token）→ token id → embedding → model processing

In [6]:

class LLMDataset(Dataset):
    """Sliding-window dataset for next-token language modelling.

    Intended for GPT-style, decoder-only generative training: the text is
    tokenized once and cut into (input, target) pairs, where the target is the
    input window shifted one token to the right.

    Window starts come from ``range(0, len(token_ids) - max_length, stride)``,
    so a text with at most ``max_length`` tokens produces no samples.

    Args:
        txt (str): The input text to be tokenized and processed.
        tokenizer (Tokenizer): The tokenizer to be used for encoding the text.
        max_length (int): The length of each input sequence.
        stride (int): The number of tokens to skip between sequence starts.
    """

    def __init__(self, txt, tokenizer, max_length: int, stride: int):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        ids = tokenizer.encode(txt)
        window_starts = range(0, len(ids) - max_length, stride)
        for start in tqdm(window_starts):
            stop = start + max_length
            self.input_ids.append(torch.tensor(ids[start:stop]))
            # Same window advanced by one token: the next-token labels.
            self.target_ids.append(torch.tensor(ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair stored at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def LLM_DataLoader(txt, tokenizer, batch_size: int, max_length: int, stride: int,
                   shuffle: bool = True, drop_last: bool = True):
    """Build an ``LLMDataset`` from raw text and wrap it in a ``DataLoader``.

    Args:
        txt (str): The input text to be tokenized and processed.
        tokenizer (Tokenizer): The tokenizer to be used for encoding the text.
        batch_size (int): The number of samples per batch to load.
        max_length (int): The length of each input sequence.
        stride (int): The number of tokens to skip between sequences.
        shuffle (bool, optional): Whether to shuffle the data at every epoch. Defaults to True.
        drop_last (bool, optional): Whether to drop the last incomplete batch. Defaults to True.

    Returns:
        DataLoader: Loader yielding ``(input_ids, target_ids)`` batches.
    """
    return DataLoader(
        LLMDataset(txt, tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )


In [None]:
# Load the dataset and try out the tokenizer.
# NOTE: read_txt() returns an empty string (after printing a message) when the
# file cannot be read, so a zero character count here means the load failed.
raw_data = read_txt(DataConfig.dataPath)
tokenizer = ttk.get_encoding("gpt2") 

# Token count of the whole corpus; reused by the split sanity check.
total_token = len(tokenizer.encode(raw_data))
print(f"-> Number of Characters : {len(raw_data)}\n-> Number of Tokens : {total_token}")

In [None]:
# Split the dataset into a training part and a validation part.
# The split index is computed on characters (raw_data is a str), not on tokens.
train_ratio = DataConfig.train_ratio
split_idxs = int(train_ratio * len(raw_data))
train_data = raw_data[:split_idxs]
val_data = raw_data[split_idxs:]
print(f'-> Length of training data : {len(train_data)}\n-> Length of val_data : {len(val_data)}')


# Sanity check
# Estimates each split's token count as total_token scaled by its ratio and warns
# when that estimate is smaller than one context window.
if total_token * (train_ratio) < GPTConfig.context_length:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPTConfig.context_length or "
          "increase the `training_ratio`")

if total_token * (1-train_ratio) < GPTConfig.context_length:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPTConfig.context_length` or "
          "decrease the `training_ratio`")

In [None]:
# Turn the training text into batches the LLM can consume.
# Shuffling is off and the last (possibly smaller) batch is kept.
train_dataloader = LLM_DataLoader(
  txt= train_data,
  tokenizer = tokenizer,
  max_length = DataConfig.max_length,
  batch_size =  DataConfig.batch_size,
  stride =  DataConfig.stride,
  shuffle=False,
  drop_last = False
)

# Quick check: pull the first batch and inspect inputs, targets and the input shape.
print("View example:")
dataiter = iter(train_dataloader)
firstbatch =next(dataiter)
print(f'inputs: \n{firstbatch[0]} \ntarget: \n{firstbatch[1]}')
firstbatch[0].shape

In [None]:
# Build the validation loader with the same settings as the training loader.
val_dataloader = LLM_DataLoader(
  txt= val_data,
  tokenizer = tokenizer,
  max_length = DataConfig.max_length,
  batch_size =  DataConfig.batch_size,
  stride =  DataConfig.stride,
  shuffle=False,
  drop_last = False
)
# Quick check: shape of the first validation input batch.
# NOTE: this rebinds dataiter and firstbatch from the training-loader check.
dataiter = iter(val_dataloader)
firstbatch =next(dataiter)
firstbatch[0].shape

#可用来训练

#实现多头注意力机制
#关于为什么引入多头注意力机制，该机制作用等，可见大模型学习笔记

In [11]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in (int): Input dimension.
        d_out (int): Output dimension; must be divisible by ``num_heads``.
        context_length (int): Side length of the causal mask, i.e. the longest
            sequence the module can process.
        dropout (float): Dropout probability applied to the attention weights.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): Whether to include bias in query, key, and
            value projections. Default is False.

    Attributes:
        d_out (int): Output dimension.
        num_heads (int): Number of attention heads.
        head_dim (int): Dimension of each attention head (``d_out // num_heads``).
        w_queries (nn.Linear): Linear projection for queries.
        w_keys (nn.Linear): Linear projection for keys.
        w_values (nn.Linear): Linear projection for values.
        out_proj (nn.Linear): Linear projection applied to the merged heads.
        dropout (nn.Dropout): Dropout layer.
        mask (torch.Tensor): Lower-triangular buffer of shape
            ``(1, 1, context_length, context_length)`` enforcing causality.
    """

    def __init__(self, d_in: int, d_out: int, context_length: int,
                 dropout: float, num_heads: int, qkv_bias: bool = False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # Layers are created in the order queries, keys, values, output so that
        # seeded initialization consumes the RNG in a fixed sequence.
        self.w_queries = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_keys = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_values = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones on and below the diagonal; two leading singleton axes let the
        # mask broadcast over batch and head dimensions.
        lower_triangle = torch.tril(torch.ones(context_length, context_length))
        self.register_buffer('mask', lower_triangle[None, None, :, :])

    def _split_heads(self, projected, batches, num_tokens):
        """Reshape ``(batch, tokens, d_out)`` to ``(batch, heads, tokens, head_dim)``."""
        per_head = projected.view(batches, num_tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Apply masked self-attention to ``x`` of shape ``(batch, tokens, d_in)``.

        Returns:
            Tensor of shape ``(batch, tokens, d_out)``.
        """
        batches, num_tokens, _ = x.shape

        queries = self._split_heads(self.w_queries(x), batches, num_tokens)
        keys = self._split_heads(self.w_keys(x), batches, num_tokens)
        values = self._split_heads(self.w_values(x), batches, num_tokens)

        # Scaled dot-product scores: (batch, heads, tokens, tokens).
        scaling = self.head_dim ** 0.5
        attn_scores = torch.matmul(queries, keys.transpose(-2, -1)) / scaling

        # Positions above the diagonal (future tokens) get -inf so softmax zeroes them.
        future = self.mask[:, :, :num_tokens, :num_tokens] == 0
        attn_scores = attn_scores.masked_fill(future, float('-inf'))

        attn_weights = self.dropout(torch.softmax(attn_scores, dim=-1))

        # Weighted sum of values, then merge the heads back into one d_out axis.
        merged = torch.matmul(attn_weights, values).transpose(1, 2).contiguous()
        merged = merged.view(batches, num_tokens, self.d_out)

        return self.out_proj(merged)



#构建大语言模型LLM
# LLM（decoder-only）的模型架构由 N 层 transformer 块组成。每层包含自注意力机制、前馈网络（feed-forward）层、层归一化层和残差连接（用于缓解梯度消失问题）。

In [12]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension.

    Each vector along the last axis is normalized to zero mean and unit
    variance, then scaled and shifted by learnable per-feature parameters.

    Args:
        emb_dim (int): The dimension of the input embeddings (size of the last axis).

    Attributes:
        eps (float): A small value to avoid division by zero.
        scale (nn.Parameter): Learnable scale parameter, initialized to ones.
        shift (nn.Parameter): Learnable shift parameter, initialized to zeros.
    """
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last axis and apply scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # Layer norm uses the population (biased) variance. Tensor.var defaults to
        # the Bessel-corrected estimate, which rescales the output by
        # sqrt((n - 1) / n) and is NaN for a single-feature input.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


In [13]:
class GELU(nn.Module):
    """GELU activation, tanh approximation.

    GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))
    """
    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        coefficient = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coefficient * cubic_term)
        return 0.5 * x * gate

In [14]:
class FeedForwardGELU(nn.Module):
    """Position-wise feed-forward network used inside each transformer block.

    Args:
        cfg: Configuration object; only its ``emb_dim`` attribute is read.

    The network consists of:
    1. a linear layer expanding embedding dimension -> 4 * embedding dimension;
    2. a GELU() activation;
    3. a linear layer projecting back to the embedding dimension.

    The three stages are stored in ``self.layers`` at indices 0, 1 and 2.
    """
    def __init__(self, cfg):
        super().__init__()
        width = cfg.emb_dim
        hidden = 4 * width

        expand = nn.Linear(width, hidden)
        contract = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, GELU(), contract)

    def forward(self, x):
        """Run ``x`` through expand -> GELU -> contract."""
        return self.layers(x)


构建transformer块的每个模块都已经具备，下面构建transformer架构

In [15]:
class TransformerBlock(nn.Module):
    """One pre-norm transformer block: attention and feed-forward sub-layers.

    Each sub-layer follows the pattern
    ``x = dropout(sublayer(norm(x))) + x`` (normalize first, then residual add).

    Args:
        cfg: Configuration object providing ``emb_dim``, ``context_length``,
            ``n_heads``, ``drop_rate`` and ``qkv_bias``.
    """
    def __init__(self, cfg):
        super().__init__()
        width = cfg.emb_dim
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg.context_length,
            num_heads=cfg.n_heads,
            dropout=cfg.drop_rate,
            qkv_bias=cfg.qkv_bias)
        self.ff = FeedForwardGELU(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        # One dropout module shared by both residual branches.
        self.dropout = nn.Dropout(cfg.drop_rate)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with residual connections."""
        # Attention sub-layer.
        attended = self.dropout(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer.
        transformed = self.dropout(self.ff(self.norm2(x)))
        return transformed + x

In [16]:
# 构建最终的LLM架构
class GPTModel(nn.Module):
    """GPT-style decoder-only language model.

    Pipeline: token embedding + positional embedding -> dropout ->
    ``n_layers`` transformer blocks -> final layer norm -> linear head
    producing vocabulary logits.

    Args:
        cfg: Configuration object providing ``vocab_size``, ``context_length``,
            ``emb_dim``, ``n_layers`` and ``drop_rate`` (plus whatever
            ``TransformerBlock`` reads).
    """
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.emb_dim)  # token embedding
        self.pos_emb = nn.Embedding(cfg.context_length, cfg.emb_dim)  # positional embedding
        # forward() applies dropout to the summed embeddings; the layer was never
        # created before, so the first forward pass raised AttributeError.
        self.dropout_emb = nn.Dropout(cfg.drop_rate)
        self.transformer_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg.n_layers)])
        self.final_norm = LayerNorm(cfg.emb_dim)
        self.out_ff = nn.Linear(cfg.emb_dim, cfg.vocab_size, bias=False)

    def forward(self, idx):
        """Compute next-token logits.

        Args:
            idx: Integer tensor of token ids with shape ``(batch, seq_len)``.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``.
        """
        batch_size, seq_len = idx.shape
        tok_embeds = self.tok_emb(idx)
        # Positions 0..seq_len-1, created on the same device as the input ids.
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=idx.device))

        x = tok_embeds + pos_embeds  # positional rows broadcast over the batch
        x = self.dropout_emb(x)
        x = self.transformer_blocks(x)
        x = self.final_norm(x)
        logits = self.out_ff(x)
        return logits


training阶段
在训练之前，要搞清楚：1. 如何用LLM来生成文本；2. 计算训练和验证集损失

In [17]:
#1. 生成文本
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedy decoding: repeatedly append the highest-scoring next token.

    Args:
        model: Callable mapping ids of shape ``(batch, tokens)`` to logits of
            shape ``(batch, tokens, vocab_size)``.
        idx: ``(batch, tokens)`` tensor holding the current context.
        max_new_tokens: Number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.

    Returns:
        ``idx`` extended by ``max_new_tokens`` columns.
    """
    for _ in range(max_new_tokens):
        # Crop the context to what the model supports.
        window = idx[:, -context_size:]

        with torch.no_grad():
            all_logits = model(window)

        # Keep only the last position: (batch, n_token, vocab_size) -> (batch, vocab_size).
        last_logits = all_logits[:, -1, :]

        # Index of the highest-scoring vocabulary entry, shape (batch, 1).
        next_token = last_logits.argmax(dim=-1, keepdim=True)

        # Append it to the running sequence: (batch, n_tokens + 1).
        idx = torch.cat([idx, next_token], dim=1)

    return idx



def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Autoregressive generation with optional top-k filtering and temperature sampling.

    Args:
        model: Callable mapping ids of shape ``(batch, tokens)`` to logits of
            shape ``(batch, tokens, vocab_size)``.
        idx: ``(batch, tokens)`` tensor holding the prompt ids.
        max_new_tokens: Upper bound on the number of tokens appended.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: ``> 0`` samples from ``softmax(logits / temperature)``;
            ``0`` (default) picks the arg-max token.
        top_k: If given, logits below the k-th largest value of their row are
            masked out before selection. Values above the vocabulary size are clamped.
        eos_id: If given, generation stops (without appending) as soon as every
            sequence in the batch selects this id in the same step.

    Returns:
        ``idx`` extended by the generated tokens, shape ``(batch, tokens + n)``.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # last position only: (batch, vocab_size)

        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing axis so each row is compared against its own
            # threshold; a (batch,) threshold broadcasts along the vocabulary
            # axis and breaks for batch sizes other than 1.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            probs = torch.softmax(logits / temperature, dim=-1)  # (batch, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch, 1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

        # Explicit None check plus .all(): truth-testing a multi-element
        # comparison tensor raises for batch sizes above 1.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)  # (batch, tokens + 1)

    return idx


def generate_and_print_sample(model, tokenizer, device, start_context,temperature, top_k, eos_id):
    """Generate up to 50 new tokens from ``start_context`` and print them on one line.

    The model is switched to eval mode for generation and put back into
    training mode afterwards.

    Args:
        model: Model exposing ``pos_emb``; the number of rows of its weight is
            used as the context size.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text.
        temperature: Forwarded to ``generate``.
        top_k: Forwarded to ``generate``.
        eos_id: Forwarded to ``generate``.
    """
    model.eval()
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    max_context = model.pos_emb.weight.shape[0]
    with torch.no_grad():
        generated_ids = generate(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
            temperature=temperature,
            top_k=top_k,
            eos_id=eos_id,
        )
    sample_text = token_ids_to_text(generated_ids, tokenizer)
    print(sample_text.replace("\n", " "))  # Compact print format
    model.train()
    


In [18]:
#2.计算损失
def calc_loss_batch(input_batch, target_batch, model, device):
    """Cross-entropy loss of ``model`` on a single batch.

    Args:
        input_batch: Token ids fed to the model.
        target_batch: Target ids, one per input position.
        model: Callable returning logits with the vocabulary on the last axis.
        device: Device both batches are moved to before the forward pass.

    Returns:
        Scalar loss tensor averaged over all positions of the batch.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    predictions = model(inputs)
    # Merge batch and token axes so every position is one classification example.
    flat_predictions = predictions.flatten(0, 1)
    flat_targets = targets.flatten()
    return torch.nn.functional.cross_entropy(flat_predictions, flat_targets)


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average per-batch loss over the first ``num_batches`` batches of a loader.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` that supports ``len()``.
        model: Model evaluated on each batch.
        device: Device the batches are moved to.
        num_batches: Number of batches to use; ``None`` means all of them.
            Values larger than ``len(data_loader)`` are reduced to that length.

    Returns:
        Mean of the batch losses as a float, or ``nan`` for an empty loader.
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    # Use every batch by default, and never more than the loader provides.
    num_batches = available if num_batches is None else min(num_batches, available)

    running_loss = 0.
    for batch_index, (input_batch, target_batch) in enumerate(data_loader):
        if batch_index >= num_batches:
            break
        running_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return running_loss / num_batches

In [19]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer,
                      temperature, top_k, eos_id, max_grad_norm=None):
    """Train ``model`` and periodically report train/validation loss.

    Args:
        model: Model to optimize.
        train_loader: Loader yielding ``(input_batch, target_batch)`` for training.
        val_loader: Loader used for the validation loss.
        optimizer: Optimizer already bound to the model parameters.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate every ``eval_freq`` optimizer steps (step 0 included).
        eval_iter: Number of batches per loader used in each evaluation.
        start_context: Prompt for the sample printed after each epoch.
        tokenizer: Tokenizer used for the sample text.
        temperature: Sampling temperature for the sample text.
        top_k: Top-k setting for the sample text.
        eos_id: End-of-sequence id for the sample text (or ``None``).
        max_grad_norm: If given, gradients are clipped to this total norm after
            every backward pass. ``None`` (default) disables clipping.

    Returns:
        Tuple ``(train_losses, val_losses, track_tokens_seen)``: three parallel
        lists with one entry per evaluation.
    """
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset gradients from the previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()  # Calculate loss gradients
            if max_grad_norm is not None:
                # Clipping only has an effect here: after the gradients exist
                # and before the optimizer consumes them.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
            optimizer.step()  # Update model weights using loss gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:03d}): "
                  f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        print("example: ")
        generate_and_print_sample(
            model, tokenizer, device, start_context,
            temperature, top_k, eos_id
        )
        print('-*-'*10)

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Compute train and validation loss over ``eval_iter`` batches each.

    The model is put into eval mode for the measurement, gradients are
    disabled, and the model is switched back to training mode before returning.

    Returns:
        Tuple ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        train_loss, val_loss = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return train_loss, val_loss



In [None]:
#训练模型
def initialize_weights(m):
    """Initializer meant for ``model.apply``: re-initializes ``nn.Linear`` modules only.

    Linear weights get Xavier-uniform initialization; a bias, when present, is
    filled with the constant 0.01. Every other module type is left untouched.

    Args:
        m: Module handed in by ``nn.Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight)
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.01)


# Fix the RNG seed so model construction and weight initialization are repeatable.
torch.manual_seed(123)
model = GPTModel(GPTConfig)
model.to(device)
# Re-initialize the nn.Linear layers via initialize_weights; other module types
# keep their construction-time values.
model.apply(initialize_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00009, weight_decay=0.1)
# NOTE(review): no backward pass has run on this freshly built model yet, so there
# are no gradients for this call to clip, and it does not configure clipping for
# the training loop below. To take effect, clipping has to run after
# loss.backward() and before optimizer.step().
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

num_epochs = 50
# Evaluate every 5 steps on 5 batches per loader; a text sample is printed after each epoch.
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_dataloader, val_dataloader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves", tokenizer=tokenizer,
        top_k=10,temperature=0.4,eos_id=None
)

In [None]:
# Model validation: generate a continuation on the CPU.
# NOTE: this moves the shared `model` object to the CPU; later cells that keep
# using it see it on the CPU as well.
model.to("cpu")
model.eval()


# The prompt tensor is left on the CPU, matching the model moved above.
token_ids = generate(
    model=model,
    idx=text_to_token_ids("quite insensible to the irony", tokenizer),
    max_new_tokens=25,
    context_size=GPTConfig.context_length,
    top_k=5,temperature=0.7,eos_id=None

)
# Show the decoded result.
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

<hr>

In [None]:
# Save the current model and optimizer state dicts to one file so they can be
# loaded again later.
print('saving model and optimizer...')
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    }, 
    "model_and_optimizer.pth"
)
print('Done')

In [None]:
# Reload the checkpoint to continue training or to use the model.
print('loading...')
# map_location remaps the stored tensors onto the active device, so a checkpoint
# that contains CUDA tensors can still be restored on a CPU-only machine.
checkpoint = torch.load("model_and_optimizer.pth", map_location=device, weights_only=True)

model = GPTModel(GPTConfig)
model.to(device)
model.load_state_dict(checkpoint["model_state_dict"])

# NOTE(review): the lr given here is expected to be replaced by the value stored
# in the checkpoint when the optimizer state is loaded below — confirm if a new
# learning rate is intended for the resumed run.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train();
print('Done')

In [None]:
# Continue training the reloaded model for two more epochs.
# NOTE: this overwrites train_losses / val_losses / tokens_seen from the first run.
num_epochs = 2
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_dataloader, val_dataloader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves ", tokenizer=tokenizer,
        top_k=10,temperature=2.7,eos_id=None
)

In [None]:
# Load an already trained model (architecture and parameters) through the transformers library.
from transformers import GPT2Model

# Hugging Face model identifier passed to from_pretrained.
gpt2_small=  "openai-community/gpt2"

# Downloaded files are cached under ./checkpoints.
gpt_hf = GPT2Model.from_pretrained(gpt2_small, cache_dir="checkpoints")
gpt_hf.eval()


In [None]:
# dataclasses.replace without overrides returns a copy of the GPTConfig instance,
# so the edits below leave the original configuration untouched.
copyConfig = replace(GPTConfig)
# Presumably chosen to match the pretrained checkpoint: load_weights copies
# q/k/v bias tensors and shape-checks the positional table — TODO confirm the
# values against the Hugging Face model config.
copyConfig.qkv_bias =True
copyConfig.context_length =  1024
copyConfig.drop_rate= 0.0       
copyConfig

In [29]:
def assign_check(left, right):
    """Return ``right`` as a new ``nn.Parameter`` after checking it fits ``left``.

    Args:
        left: Tensor whose shape is the reference (the parameter being replaced).
        right: Tensor providing the new values.

    Returns:
        A ``torch.nn.Parameter`` holding a detached copy of ``right``.

    Raises:
        ValueError: If the two shapes differ.
    """
    if left.shape == right.shape:
        return torch.nn.Parameter(right.detach().clone())
    raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

In [30]:
def load_weights(gpt, gpt_hf):
    """Copy the parameters of a Hugging Face GPT-2 model into ``gpt``.

    Every tensor is shape-checked by ``assign_check`` before it replaces the
    corresponding parameter. Linear weights read from the source are
    transposed before assignment, and the fused ``c_attn`` projection is split
    into equal query/key/value thirds along its last axis.

    Args:
        gpt: Target ``GPTModel``; its q/k/v projections must have been built with a bias.
        gpt_hf: Source model whose ``state_dict()`` uses the key names read
            below (``wte``, ``wpe``, ``h.{i}.*``, ``ln_f``).

    Raises:
        ValueError: If a source tensor does not match the target shape.
    """
    d = gpt_hf.state_dict()

    gpt.pos_emb.weight = assign_check(gpt.pos_emb.weight, d["wpe.weight"])
    gpt.tok_emb.weight = assign_check(gpt.tok_emb.weight, d["wte.weight"])

    # Walk the blocks of the model being filled instead of reading the layer
    # count from a notebook-level config object.
    for b, block in enumerate(gpt.transformer_blocks):
        att, ff = block.att, block.ff

        # Fused attention projection: split with torch rather than numpy, since
        # the state dict holds torch tensors.
        q_w, k_w, v_w = torch.chunk(d[f"h.{b}.attn.c_attn.weight"], 3, dim=-1)
        att.w_queries.weight = assign_check(att.w_queries.weight, q_w.T)
        att.w_keys.weight = assign_check(att.w_keys.weight, k_w.T)
        att.w_values.weight = assign_check(att.w_values.weight, v_w.T)

        q_b, k_b, v_b = torch.chunk(d[f"h.{b}.attn.c_attn.bias"], 3, dim=-1)
        att.w_queries.bias = assign_check(att.w_queries.bias, q_b)
        att.w_keys.bias = assign_check(att.w_keys.bias, k_b)
        att.w_values.bias = assign_check(att.w_values.bias, v_b)

        att.out_proj.weight = assign_check(att.out_proj.weight, d[f"h.{b}.attn.c_proj.weight"].T)
        att.out_proj.bias = assign_check(att.out_proj.bias, d[f"h.{b}.attn.c_proj.bias"])

        # Feed-forward: layers[0] is the expanding linear, layers[2] the contracting one.
        ff.layers[0].weight = assign_check(ff.layers[0].weight, d[f"h.{b}.mlp.c_fc.weight"].T)
        ff.layers[0].bias = assign_check(ff.layers[0].bias, d[f"h.{b}.mlp.c_fc.bias"])
        ff.layers[2].weight = assign_check(ff.layers[2].weight, d[f"h.{b}.mlp.c_proj.weight"].T)
        ff.layers[2].bias = assign_check(ff.layers[2].bias, d[f"h.{b}.mlp.c_proj.bias"])

        block.norm1.scale = assign_check(block.norm1.scale, d[f"h.{b}.ln_1.weight"])
        block.norm1.shift = assign_check(block.norm1.shift, d[f"h.{b}.ln_1.bias"])
        block.norm2.scale = assign_check(block.norm2.scale, d[f"h.{b}.ln_2.weight"])
        block.norm2.shift = assign_check(block.norm2.shift, d[f"h.{b}.ln_2.bias"])

    # Model-level parameters are assigned once, outside the per-block loop.
    gpt.final_norm.scale = assign_check(gpt.final_norm.scale, d["ln_f.weight"])
    gpt.final_norm.shift = assign_check(gpt.final_norm.shift, d["ln_f.bias"])
    # The output head takes its own copy of the token-embedding matrix.
    gpt.out_ff.weight = assign_check(gpt.out_ff.weight, d["wte.weight"])

In [31]:
# Re-select the device (GPU when available, otherwise CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build a model from the adjusted config and overwrite its parameters with the
# Hugging Face weights. The model is not moved to `device` here.
gpt = GPTModel(copyConfig)
load_weights(gpt, gpt_hf)


In [None]:
# test
# Generate with the weight-loaded model; model and prompt are both moved to `device`.
# With top_k=1 only the largest logit(s) survive the filter, so sampling at
# temperature 1.0 is effectively greedy.
token_ids = generate(
    model=gpt.to(device),
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=30,
    context_size=copyConfig.context_length,
    top_k=1,
    temperature=1.0
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))