### Full finished code, for reference

You may want to refer directly to the git repo instead though.

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4

grad_clip = 1.0
best_val_loss = float('inf')
patience_counter = 0
patience = 5

eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.3
# ------------

In [3]:
torch.manual_seed(1337)

with open('kinyas_kayra_clean.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
import torch
from collections import Counter

class ByteLevelBPE:
    def __init__(self, text, num_merges=500):
        self.text = text
        self.num_merges = num_merges
        self.vocab = None
        self.merges = []
        self.token_to_id = {}
        self.id_to_token = {}
        self._learn_bpe()
        self._build_token_vocab()

    def _get_vocab(self):
        vocab = Counter()
        words = self.text.strip().split()

        for word in words:
            word_bytes = list(word.encode('utf-8'))
            word_bytes_str = [f"{b:03d}" for b in word_bytes]
            tokenized = ' '.join(word_bytes_str + ['</w>'])
            vocab[tokenized] += 1

        return vocab

    def _get_stats(self, vocab):
        pairs = Counter()
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                if symbols[i+1] == '</w>':
                    continue
                pairs[(symbols[i], symbols[i+1])] += freq
        return pairs

    def _merge_vocab(self, pair, vocab_in):
        vocab_out = {}
        replacement = pair[0] + pair[1]

        for word, freq in vocab_in.items():
            symbols = word.split()
            new_symbols = []
            i = 0

            while i < len(symbols):
                if i < len(symbols) - 1 and (symbols[i], symbols[i+1]) == pair:
                    new_symbols.append(replacement)
                    i += 2
                else:
                    new_symbols.append(symbols[i])
                    i += 1

            new_word = ' '.join(new_symbols)
            vocab_out[new_word] = freq

        return vocab_out

    def _learn_bpe(self):
        self.vocab = self._get_vocab()
        for i in range(self.num_merges):
            pairs = self._get_stats(self.vocab)
            if not pairs:
                break
            best = max(pairs, key=pairs.get)
            self.vocab = self._merge_vocab(best, self.vocab)
            self.merges.append(best)
            if i % 100 == 0 or i == self.num_merges - 1:
                print(f"Merge {i+1}: {best}")

        self.merges_set = set(self.merges)

    def _build_token_vocab(self):
        # BPE tokenları: başlangıçta tüm byte kodları + merged tokenlar
        tokens = set()
        # Tüm kelimeler
        for word in self.vocab.keys():
            for token in word.split():
                tokens.add(token)
        # Ayrıca merge'lerle oluşan tokenlar
        for a,b in self.merges:
            tokens.add(a+b)
        tokens.discard('</w>')  # </w> genelde tokenize edilmez veya farklı işlenir
        tokens = sorted(list(tokens))
        self.token_to_id = {tok: idx for idx, tok in enumerate(tokens)}
        self.id_to_token = {idx: tok for tok, idx in self.token_to_id.items()}

    def encode(self, word):
        word_bytes = [f"{b:03d}" for b in word.encode('utf-8')] + ['</w>']

        while True:
            pairs = [(word_bytes[i], word_bytes[i+1]) for i in range(len(word_bytes)-1)]
            mergeable = [p for p in pairs if p in self.merges]

            if not mergeable:
                break

            best = None
            for merge in self.merges:
                if merge in pairs:
                    best = merge
                    break

            if best is None:
                break

            new_word = []
            i = 0
            while i < len(word_bytes):
                if i < len(word_bytes) - 1 and (word_bytes[i], word_bytes[i+1]) == best:
                    new_word.append(word_bytes[i] + word_bytes[i+1])
                    i += 2
                else:
                    new_word.append(word_bytes[i])
                    i += 1

            word_bytes = new_word

        encoded_ids = []
        for token in word_bytes:
            if token == '</w>':
                continue
            encoded_ids.append(self.token_to_id[token])
        return encoded_ids

    def decode(self, token_ids):
        tokens = [self.id_to_token[id_] for id_ in token_ids]
        byte_sequence = []
        for token in tokens:
            for i in range(0, len(token), 3):
                byte_sequence.append(int(token[i:i+3]))
        return bytes(byte_sequence).decode('utf-8', errors='replace')

bpe = ByteLevelBPE(text, num_merges=4000)

word = "Kinyas"
encoded = bpe.encode(word)
print("Encoded:", encoded)

decoded = bpe.decode(encoded)
print("Decoded:", decoded)



# here are all the unique characters that occur in this text
#chars = sorted(list(set(text)))
#vocab_size = len(chars)
# create a mapping from characters to integers
#stoi = { ch:i for i,ch in enumerate(chars) }
#itos = { i:ch for i,ch in enumerate(chars) }
#encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
#decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

Merge 1: ('196', '177')
Merge 101: ('197159', '101')
Merge 201: ('105', '104')
Merge 301: ('075', '105110121097')
Merge 401: ('097114', '107')
Merge 501: ('116', '097114097')
Merge 601: ('108097', '121097')
Merge 701: ('111108109097', '108196177')
Merge 801: ('196176', '107105')
Merge 901: ('105110', '099105')
Merge 1001: ('097110', '110101')
Merge 1101: ('107097114', '196177')
Merge 1201: ('103', '195188108')
Merge 1301: ('117196159', '114097')
Merge 1401: ('107097', '102')
Merge 1501: ('100097110', '046')
Merge 1601: ('112', '105122')
Merge 1701: ('098097', '122196177')
Merge 1801: ('076', '111')
Merge 1901: ('197159', '097110')
Merge 2001: ('097', '105116')
Merge 2101: ('098101110122101', '121101110')
Merge 2201: ('101116', '116105109')
Merge 2301: ('116105', '116114101')
Merge 2401: ('100195188110121097', '121097')
Merge 2501: ('111108117114', '100117')
Merge 2601: ('107097114197159196177', '108196177196159196177110100097')
Merge 2701: ('107097108196177', '114')
Merge 2801: ('10011

In [6]:
from tqdm import tqdm

def encode_text_with_bpe_ids(bpe_obj, text):
    tokens = []
    words = text.strip().split()
    for word in tqdm(words, desc="Encoding with BPE"):
        tokens.extend(bpe_obj.encode(word))
    return tokens


tokens = encode_text_with_bpe_ids(bpe, text)
data = torch.tensor(tokens, dtype=torch.long)

n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print(f"Total token: {len(data)}")
print(f"Train data size: {len(train_data)}")
print(f"Val data size: {len(val_data)}")

# Train and test splits
#data = torch.tensor(encode(text), dtype=torch.long)
#n = int(0.9*len(data)) # first 90% will be train, rest val
#train_data = data[:n]
#val_data = data[n:]

Encoding with BPE: 100%|██████████| 143007/143007 [07:01<00:00, 339.16it/s]


Total token: 247710
Train data size: 222939
Val data size: 24771


In [7]:
# data loading
def get_batch(split):
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

In [8]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [9]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [10]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [11]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [12]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


In [13]:
# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # (B, vocab_size)

            logits = logits / temperature

            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                min_v = v[:, -1].unsqueeze(1)
                logits = torch.where(logits < min_v, torch.full_like(logits, -float('Inf')), logits)

            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [14]:
vocab_size = len(bpe.token_to_id)
vocab_size

4071

In [15]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-2)

13.869543 M parameters


In [17]:
import torch
import math
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/bpe_transformer")

def get_lr(it, warmup_iters=500, max_lr=1e-3, total_iters=5000):
    if it < warmup_iters:
        return max_lr * it / warmup_iters
    elif it > total_iters:
        return 0.0
    else:
        decay_ratio = (it - warmup_iters) / (total_iters - warmup_iters)
        return max_lr * 0.5 * (1.0 + math.cos(math.pi * decay_ratio))

for iter in range(max_iters):
    # Learning rate scheduler
    lr = get_lr(iter)
    for g in optimizer.param_groups:
        g['lr'] = lr

    # Değerlendirme ve log
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        train_loss = losses['train']
        val_loss = losses['val']

        print(f"Step {iter}: Train {train_loss:.4f}, Val {val_loss:.4f}, LR {lr:.6f}")
        writer.add_scalar("Loss/train", train_loss, iter)
        writer.add_scalar("Loss/val", val_loss, iter)
        writer.add_scalar("Learning Rate", lr, iter)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
            print("✨ Best model saved.")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("🛑 Early stopping.")
                break

    # Eğitim adımı
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()


#for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
#    if iter % eval_interval == 0 or iter == max_iters - 1:
#        losses = estimate_loss()
#        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
#    xb, yb = get_batch('train')

    # evaluate the loss
#    logits, loss = model(xb, yb)
#    optimizer.zero_grad(set_to_none=True)
#    loss.backward()
#    optimizer.step()


Step 0: Train 8.4729, Val 8.4705, LR 0.000000
✨ Best model saved.
Step 500: Train 4.5793, Val 6.0280, LR 0.001000
✨ Best model saved.
Step 1000: Train 2.2509, Val 7.1576, LR 0.000970
Step 1500: Train 0.6333, Val 8.8683, LR 0.000883
Step 2000: Train 0.2566, Val 10.0710, LR 0.000750
Step 2500: Train 0.1766, Val 10.8323, LR 0.000587
Step 3000: Train 0.1460, Val 11.3240, LR 0.000413
🛑 Early stopping.


In [20]:
prompt = "fahişe"
prompt_tokens = []
for w in prompt.strip().split():
    prompt_tokens.extend(bpe.encode(w))
context = torch.tensor(prompt_tokens, dtype=torch.long, device=device).unsqueeze(0)

generated_ids = model.generate(context, max_new_tokens=100, temperature=0.7, top_k=50)[0].tolist()
print("Generated text:")
print(bpe.decode(generated_ids))

# generate from the model
#context = torch.zeros((1, 1), dtype=torch.long, device=device)
#print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

Generated text:
fahişesadeceannemleilgilihiççalıştıklarımayarakkeskinleştiricisohbetkonulardandahasonrayanonuniçinekapanmıştı.Vehemenhemensonrabenigörüncedışarıçıkıncaerkeyanonunüçüncüdeyavaşçasurazlakkafasıçenesinegöçayolüklerindenbirşekildetormakyajçantavardı.Veadamyanımızadahayenibirdomatendi.Tırınnefretedilmemiş,TamamBelçikalıvardı.Veoda,beyazharflerlebirdagözündecan.YaKinyasadamın
