# Chapters 4 and 5 of the course book

In [63]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import json
import stanza
from utills import MultiHeadAttention, GELU 

In [64]:
# Use the Metal (MPS) backend when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(
    "Success: Using Apple M2 GPU Acceleration"
    if device.type == "mps"
    else "Using CPU (Slow)"
)

Success: Using Apple M2 GPU Acceleration


Load text

In [66]:
# Read the raw corpus text.
with open("torah.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Load the pre-tokenized corpus. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("output/text_stanza.json", "r", encoding="utf-8") as f:
    tokens_text = json.load(f)

# Build the vocabulary from the distinct tokens, sorted so that ids are
# stable across runs, plus lookup tables in both directions.
vocab = sorted(set(tokens_text))
VOCAB_SIZE = len(vocab)
print(f"Vocabulary size: {VOCAB_SIZE}")
word2idx = {token: idx for idx, token in enumerate(vocab)}
idx2word = {idx: token for idx, token in enumerate(vocab)}

Vocabulary size: 9109


Load embeddings

In [67]:
# Locate the pretrained embedding matrix inside the saved checkpoint.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the file is a plain state_dict).
emb_tensor = None
state = torch.load("output/word2vec_model_stanza_short.pth", map_location=device)
if isinstance(state, dict):
    # Well-known parameter names are tried first, in priority order; after
    # that, every entry of the checkpoint is considered in stored order.
    preferred_keys = (
        "in_embed.weight",
        "out_embed.weight",
        "embeddings.weight",
        "embedding.weight",
        "encoder.weight",
    )
    candidates = [state[key] for key in preferred_keys if key in state]
    candidates.extend(state.values())

    for candidate in candidates:
        # Accept the first 2-D tensor whose row count matches the vocabulary.
        if (
            isinstance(candidate, torch.Tensor)
            and candidate.dim() == 2
            and candidate.size(0) == VOCAB_SIZE
        ):
            emb_tensor = candidate
            break

if emb_tensor is None:
    raise RuntimeError(
        "Couldn't find a 2D embedding tensor matching VOCAB_SIZE in the saved state_dict."
    )
emb_tensor = emb_tensor.to(device)
print(f"Loaded embedding tensor of shape: {emb_tensor.shape}")

Loaded embedding tensor of shape: torch.Size([9109, 100])


In [68]:
# Model hyperparameters; vocabulary size and embedding width follow the loaded data.
GPT_CONFIG = dict(
    vocab_size=VOCAB_SIZE,  # number of distinct tokens in the corpus
    context_length=256,  # number of rows in the positional embedding table
    emb_dim=emb_tensor.shape[1],  # width of the loaded embedding matrix
    n_heads=10,  # attention heads per transformer block
    n_layers=12,  # number of stacked transformer blocks
    drop_rate=0.1,  # dropout probability
    qkv_bias=False,  # forwarded to MultiHeadAttention as qkv_bias
)

In [69]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift."""

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance to avoid division by zero.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last axis, then apply scale and shift."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Biased (population) variance, i.e. divide by N rather than N - 1.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = centered / torch.sqrt(variance + self.eps)
        return self.scale * normalised + self.shift

class FeedForward(nn.Module):
    """Feed-forward sub-layer: expand to 4x the embedding width, apply GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        # Modules are built in the same order in which they are applied.
        expand = nn.Linear(width, hidden)
        activation = GELU()
        project = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Apply the three layers in sequence to ``x``."""
        out = self.layers(x)
        return out

In [70]:
class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward sub-layers.

    Each sub-layer normalises its input first, then applies dropout to its
    output and adds the un-normalised input back as a shortcut.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        drop_rate = cfg["drop_rate"]
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=drop_rate,
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        # The same dropout module is reused for both shortcut paths.
        self.drop_shortcut = nn.Dropout(drop_rate)

    def forward(self, x):
        """Run ``x`` through the attention and feed-forward sub-layers."""
        # Attention sub-layer with shortcut connection.
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer with shortcut connection.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x

In [71]:
class GPTModel(nn.Module):
    """GPT-style model: token + position embeddings, a stack of transformer
    blocks, a final layer norm and a bias-free linear head over the vocabulary."""

    def __init__(self, cfg):
        super().__init__()
        vocab_size, width = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(width)
        self.out_head = nn.Linear(width, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids to logits.

        ``in_idx`` must be two-dimensional (batch, sequence). The result has
        one vector of size ``vocab_size`` per input position.
        """
        _, seq_len = in_idx.shape
        # Position ids 0..seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        return self.out_head(self.final_norm(hidden))

create the model

In [72]:
# Build the model from the config, then move its parameters to the selected device.
model = GPTModel(GPT_CONFIG)
model.to(device)

generating text

In [73]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    ``idx`` is a (batch, n_tokens) tensor of token ids. At every step only the
    last ``context_size`` tokens are fed to ``model``, and the id with the
    highest probability at the final position is appended.

    Returns the (batch, n_tokens + max_new_tokens) tensor of ids.
    """
    for _step in range(max_new_tokens):
        # Keep at most the last `context_size` tokens as model input.
        window = idx[:, -context_size:]

        # No gradients are needed for generation; keep only the last time
        # step: (batch, n_tokens, vocab_size) -> (batch, vocab_size).
        with torch.no_grad():
            last_logits = model(window)[:, -1, :]

        # Probabilities over the vocabulary, then the most likely token id.
        probabilities = last_logits.softmax(dim=-1)
        next_token = probabilities.argmax(dim=-1, keepdim=True)  # (batch, 1)

        # Grow the running sequence by one token.
        idx = torch.cat([idx, next_token], dim=1)

    return idx

create stanza tokenizer

In [74]:
# Hebrew Stanza pipeline with the tokenize and mwt processors, on the selected device.
nlp = stanza.Pipeline(lang="he", processors="tokenize,mwt", device=device)

2026-01-28 12:33:33 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 436kB [00:00, 11.3MB/s]                    
2026-01-28 12:33:33 INFO: Downloaded file to /Users/erannovak/stanza_resources/resources.json
2026-01-28 12:33:33 INFO: Loading these models for language: he (Hebrew):
| Processor | Package           |
---------------------------------
| tokenize  | combined_nocharlm |
| mwt       | combined          |

2026-01-28 12:33:33 INFO: Using device: mps
2026-01-28 12:33:33 INFO: Loading: tokenize
2026-01-28 12:33:33 INFO: Loading: mwt
2026-01-28 12:33:33 INFO: Done loading processors!


In [75]:
class My_Stanza_Tokenizer:
    """Word-level tokenizer built on a Stanza pipeline and a fixed vocabulary.

    Text is split into Stanza words. Words that are not in the vocabulary are
    silently dropped; they are not mapped to an unknown-token id.
    """

    def __init__(self, nlp=nlp):
        self.nlp = nlp

    def _split_words(self, input_text):
        """Run the pipeline on ``input_text`` and return its word strings in order."""
        doc = self.nlp(input_text)
        return [word.text for sent in doc.sentences for word in sent.words]

    def tokenize(self, input_text, word2idx=word2idx):
        """Convert raw text into a flat list of vocabulary ids."""
        return [
            word2idx[token]
            for token in self._split_words(input_text)
            if token in word2idx
        ]

    def encode(self, input_tokens, word2idx=word2idx):
        """Return a 1-D list of token ids (no batch dimension).

        Accepts raw text (``str``), a list/tuple of token strings, or a
        list/tuple of ints (returned as a new list unchanged).

        Raises:
            TypeError: if ``input_tokens`` is neither ``str`` nor list/tuple.
        """
        if isinstance(input_tokens, str):
            return self.tokenize(input_tokens, word2idx)
        if isinstance(input_tokens, (list, tuple)):
            if len(input_tokens) == 0:
                return []
            # Already token ids: hand back a copy as a list.
            if isinstance(input_tokens[0], int):
                return list(input_tokens)
            # Otherwise treat the items as token strings.
            return [word2idx[token] for token in input_tokens if token in word2idx]
        raise TypeError("encode() expects str or list/tuple")

    def decode(self, indices, idx2word=idx2word):
        """Convert token ids back to space-joined text.

        A flat sequence (or a tensor with a leading batch dimension of 1)
        gives one string. Nested sequences / multi-row tensors give a list
        with one string per row. Ids missing from ``idx2word`` are skipped,
        and empty input gives an empty string.
        """
        if isinstance(indices, torch.Tensor):
            # Drop a leading batch dimension of size 1, but never squeeze a
            # 1-D tensor down to a scalar.
            if indices.dim() > 1 and indices.size(0) == 1:
                indices = indices.squeeze(0)
            indices = indices.tolist()
        if isinstance(indices, int):
            # A 0-d tensor converts to a bare int; treat it as one token.
            indices = [indices]
        indices = list(indices)

        # Batched input: decode each row on its own.
        if indices and isinstance(indices[0], (list, tuple)):
            return [self.decode(row, idx2word) for row in indices]

        return " ".join(idx2word[idx] for idx in indices if idx in idx2word)

In [76]:
# Round-trip a sample sentence: text -> token ids -> text.
start_context = "כי ידע כל לב איש"
tokenizer = My_Stanza_Tokenizer(nlp)
encoded_context = tokenizer.tokenize(start_context, word2idx)
decoded_context = tokenizer.decode(encoded_context, idx2word)
print(encoded_context)
print(decoded_context)

[4119, 3014, 4140, 4389, 292]
כי ידע כל לב איש


In [77]:
# Tokenize the sample sentence directly with the Stanza pipeline.
start_context = "כי ידע כל לב איש"
doc = nlp(start_context)
tokens = []
for sent in doc.sentences:
    tokens.extend(word.text for word in sent.words)
print(tokens)

['כי', 'ידע', 'כל', 'לב', 'איש']


In [78]:
# Map in-vocabulary tokens to ids, then wrap them in a batch of size one.
encoded = [word2idx[tok] for tok in tokens if tok in word2idx]
print(encoded)
encoded_tensor = torch.tensor([encoded], dtype=torch.long, device=device)


[4119, 3014, 4140, 4389, 292]


In [79]:
# Switch to evaluation mode so dropout is disabled during generation.
model.eval()

out = generate_text_simple(
    model,
    encoded_tensor,
    6,
    GPT_CONFIG["context_length"],
)

print("Output:", out)
print("Output length:", out.shape[1])

Output: tensor([[4119, 3014, 4140, 4389,  292, 7283,  342, 5170, 3065, 2453, 7639]],
       device='mps:0')
Output length: 11


In [80]:
# Translate the generated ids of the single batch row back into token strings.
print("Generated tokens:", [idx2word[token_id] for token_id in out[0].tolist()])

Generated tokens: ['כי', 'ידע', 'כל', 'לב', 'איש', 'צדיו', 'אלהם', 'מות', 'יובא', 'חדשי', 'קרכ']


Training text completion
Data set and data loader

In [47]:
# Fraction of the token sequence used for training; the rest is held out for validation.
TRAIN_RATIO = 0.9

In [81]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) id sequences.

    Every target window is its input window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the whole input once; uses the notebook-level `word2idx`.
        token_ids = tokenizer.encode(txt, word2idx)
        assert (
            len(token_ids) > max_length
        ), "Number of tokenized inputs must at least be equal to max_length+1"

        # Window start offsets, `stride` apart, each leaving room for the
        # one-token-shifted target.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start : start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1 : start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]



In [86]:
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of sliding-window (input, target) id batches.

    Args:
        txt: raw text (``str``), a list/tuple of token strings, or a
            list/tuple of token ids.
        batch_size: number of windows per batch.
        max_length: window length in tokens.
        stride: offset between the starts of consecutive windows.
        shuffle, drop_last, num_workers: forwarded to ``DataLoader``.

    Raises:
        ValueError: if ``txt`` is an empty list/tuple, or encodes to no more
            than ``max_length`` ids.
        TypeError: if ``txt`` is neither ``str`` nor list/tuple (raised by
            the tokenizer's ``encode``).
    """
    tokenizer = My_Stanza_Tokenizer(nlp)

    if isinstance(txt, (list, tuple)) and len(txt) == 0:
        raise ValueError("Empty input 'txt'")

    # encode() handles raw text, token strings and ready-made ids alike.
    token_ids = tokenizer.encode(txt, word2idx)

    if len(token_ids) <= max_length:
        raise ValueError(f"token_ids too short for max_length={max_length} (len={len(token_ids)})")

    # Pass the ids, not the original input: the dataset calls encode()
    # itself, and for a list of ints that is a cheap copy rather than a
    # second tokenization pass.
    dataset = GPTDatasetV1(token_ids, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

    return dataloader

In [83]:
# token and id's functions
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return a (1, n_tokens) long tensor.

    The dtype is pinned to ``torch.long`` so that an empty encoding still
    gives an integer tensor; without it, an empty list is inferred as float.
    """
    encoded = tokenizer.encode(text)
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # add batch dimension
    return encoded_tensor


def token_ids_to_text(token_ids, tokenizer):
    """Drop a leading batch dimension of size 1 from ``token_ids`` and decode the ids."""
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)

training and validation data

In [84]:
# Split the token sequence in order: the first part trains, the tail validates.
train_ratio = TRAIN_RATIO
split_idx = int(len(tokens_text) * train_ratio)
train_data, val_data = tokens_text[:split_idx], tokens_text[split_idx:]

print(f"Train data size: {len(train_data)} tokens")
print("train data sample:", train_data[:5])
print(f"Validation data size: {len(val_data)} tokens")

Train data size: 108688 tokens
train data sample: ['ב', 'ראשית', 'ברא', 'אלהים', 'את']
Validation data size: 12077 tokens


In [87]:
# Settings shared by both loaders: windows as long as the model context,
# with stride equal to the window length so consecutive windows do not overlap.
_loader_kwargs = dict(
    batch_size=2,
    max_length=GPT_CONFIG["context_length"],
    stride=GPT_CONFIG["context_length"],
    num_workers=0,
)

# Training batches are shuffled and an incomplete final batch is dropped.
train_loader = create_dataloader_v1(
    train_data, drop_last=True, shuffle=True, **_loader_kwargs
)

# Validation keeps the original order and every batch.
val_loader = create_dataloader_v1(
    val_data, drop_last=False, shuffle=False, **_loader_kwargs
)
# print("Train loader:")
# for x, y in train_loader:
#     print(x.shape, y.shape)

# print("\nValidation loader:")
# for x, y in val_loader:
#     print(x.shape, y.shape)

Loss functions

In [88]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Cross-entropy loss of ``model`` on one batch, computed on ``device``.

    The first two dimensions of the logits are merged and the targets are
    flattened, so every position counts as one classification example.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    flat_logits = model(inputs).flatten(0, 1)
    return torch.nn.functional.cross_entropy(flat_logits, targets.flatten())


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average per-batch loss over the first ``num_batches`` batches of ``data_loader``.

    Returns NaN for an empty loader. ``num_batches=None`` means all batches;
    a larger value is capped at the number of batches available.
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    num_batches = available if num_batches is None else min(num_batches, available)

    running_loss = 0.0
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        running_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return running_loss / num_batches


test loss functions

In [89]:
# Check both losses before training; gradients are not needed for this.
with torch.no_grad():
    train_loss = calc_loss_loader(data_loader=train_loader, model=model, device=device)
    val_loss = calc_loss_loader(data_loader=val_loader, model=model, device=device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 9.30222293565858
Validation loss: 9.294546683629354


training loop

helper function

In [90]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate 50 tokens after ``start_context`` and print them on one line.

    The model is put in eval mode for generation and switched back to train
    mode before returning.
    """
    model.eval()
    # The positional embedding table has one row per supported position.
    context_size = model.pos_emb.weight.size(0)
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=context_size,
        )
    sample = token_ids_to_text(generated, tokenizer)
    print(sample.replace("\n", " "))  # Compact print format
    model.train()

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` averaged over at most ``eval_iter`` batches each.

    Runs in eval mode without gradient tracking and restores train mode afterwards.
    """
    model.eval()
    with torch.no_grad():
        losses = tuple(
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return losses

In [91]:
def train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs,
    eval_freq,
    eval_iter,
    start_context,
    tokenizer,
):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    Every ``eval_freq`` optimizer steps (starting with step 0) the train and
    validation losses are estimated on up to ``eval_iter`` batches and
    printed. A generated sample is printed after each epoch.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with
        one entry per evaluation.
    """
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = -1  # becomes 0 after the first optimizer step

    for epoch in range(num_epochs):
        model.train()  # evaluation and sampling helpers also leave the model in train mode

        for input_batch, target_batch in train_loader:
            # One optimization step: clear old gradients, backpropagate, update.
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            tokens_seen += input_batch.numel()
            global_step += 1

            # Only evaluate on every `eval_freq`-th step.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter
            )
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            print(
                f"Ep {epoch+1} (Step {global_step:06d}): "
                f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}"
            )

        # Show a generated sample once per epoch.
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen
    

training loop

In [92]:
# Fix the RNG seed, then encode the prompt as a batch of size one on the selected device.
torch.manual_seed(123)
start_text = "וילכו בני ישראל אל המקום"
doc = nlp(start_text)
tokens = [word.text for sent in doc.sentences for word in sent.words]
encoded = [word2idx[tok] for tok in tokens if tok in word2idx]
start_context = torch.tensor([encoded], dtype=torch.long, device=device)
print(start_context)

tensor([[2223,  948, 3937,  330, 1445, 5720]], device='mps:0')


In [93]:

# Set up the tokenizer and optimizer, then run the training loop.
tokenizer = My_Stanza_Tokenizer(nlp)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 50

train_losses, val_losses, tokens_seen = train_model_simple(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="וילכו בני ישראל אל המקום",
    tokenizer=tokenizer,
)

Ep 1 (Step 000000): Train loss 9.110, Val loss 9.065
Ep 1 (Step 000005): Train loss 8.421, Val loss 8.470
Ep 1 (Step 000010): Train loss 8.225, Val loss 8.164
Ep 1 (Step 000015): Train loss 7.921, Val loss 7.941
Ep 1 (Step 000020): Train loss 7.678, Val loss 7.740
Ep 1 (Step 000025): Train loss 7.559, Val loss 7.553
Ep 1 (Step 000030): Train loss 7.235, Val loss 7.365
Ep 1 (Step 000035): Train loss 7.157, Val loss 7.177
Ep 1 (Step 000040): Train loss 6.921, Val loss 7.009
Ep 1 (Step 000045): Train loss 6.779, Val loss 6.861
Ep 1 (Step 000050): Train loss 6.751, Val loss 6.732
Ep 1 (Step 000055): Train loss 6.552, Val loss 6.623
Ep 1 (Step 000060): Train loss 6.399, Val loss 6.513
Ep 1 (Step 000065): Train loss 6.380, Val loss 6.432
Ep 1 (Step 000070): Train loss 6.230, Val loss 6.361
Ep 1 (Step 000075): Train loss 6.114, Val loss 6.296
Ep 1 (Step 000080): Train loss 6.219, Val loss 6.239
Ep 1 (Step 000085): Train loss 6.157, Val loss 6.193
Ep 1 (Step 000090): Train loss 6.177, Val loss