In [1]:
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
import json
import re
import random
from tqdm import tqdm
from collections import Counter, defaultdict
from datasets import load_dataset
import unicodedata
import math
from pathlib import Path

In [2]:
# Device configuration: prefer the GPU whenever PyTorch can see one.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Seed PyTorch's global RNG so weight init and batch sampling are repeatable.
torch.manual_seed(1337)

Using device: cuda


<torch._C.Generator at 0x7f3881ec09f0>

In [37]:
# Make sure the checkpoint folders exist before training writes into them.
for checkpoint_dir in ("checkpoints", "checkpoints/best_model"):
    os.makedirs(checkpoint_dir, exist_ok=True)

# Data loading and preprocessing
def clean_text(text: str) -> str:
    """Clean and normalize a piece of Turkish text.

    Steps, in order: NFKC Unicode normalization, removal of every
    ``[...]`` and ``(...)`` span (non-greedy), collapsing each run of
    whitespace into a single space, and stripping both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    cleaned = unicodedata.normalize("NFKC", text)
    substitutions = (
        (r'\[.*?\]|\(.*?\)', ''),  # drop bracketed / parenthesised asides
        (r'\s+', ' '),             # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def load_and_preprocess_data(max_samples=50000):
    """Load the dataset and return its cleaned summaries.

    Loads the ``train`` split of ``musabg/wikipedia-tr-summarization`` and
    runs ``clean_text`` on the ``"summary"`` field of the first
    ``min(len(dataset), max_samples)`` rows.

    Args:
        max_samples: Upper bound on the number of rows to process.

    Returns:
        A list of cleaned summary strings, in dataset order.
    """
    dataset = load_dataset("musabg/wikipedia-tr-summarization", split='train')
    sample_count = min(len(dataset), max_samples)

    return [
        clean_text(dataset[row]["summary"])
        for row in tqdm(range(sample_count), desc="Preprocessing data")
    ]


In [32]:
# Tokenizer class
class OptimizedByteLevelBPE:
    """Byte-level BPE tokenizer.

    Every UTF-8 byte is written as a zero-padded three-digit decimal string
    (``"097"`` for ``a``). A merged token is the concatenation of its parts
    (``"097098"`` for ``ab``), so any token can be decoded by reading it three
    digits at a time.

    ID layout: special tokens first, then the 256 byte tokens, then one ID per
    distinct merged token in merge order.

    Attributes:
        merges: Ordered list of ``(left, right)`` merge pairs.
        vocab: Token -> ID mapping persisted by ``save_model`` (filled by ``train``).
        special_tokens: Special token strings, in ID order.
    """

    # Pre-tokenisation pattern shared by train() and encode(): word runs,
    # single punctuation characters, and whitespace runs.
    _PRETOKENIZE_PATTERN = r'\w+|[^\w\s]|\s+'
    # End-of-word marker emitted by the earlier training code. It is no longer
    # produced, but merges loaded from older files may still contain it.
    _END_OF_WORD = "</w>"

    def __init__(self, merges=None, vocab=None, special_tokens=None):
        """Create a tokenizer, optionally from previously learned state.

        Args:
            merges: Iterable of ``(left, right)`` pairs in merge order.
            vocab: Stored vocabulary mapping; kept only for serialization.
            special_tokens: Special token strings; defaults to
                ``['<pad>', '<unk>', '<sos>', '<eos>']``.
        """
        # Normalize to tuples so pairs are hashable (JSON gives back lists).
        self.merges = [tuple(m) for m in (merges or [])]
        self.vocab = vocab or {}
        self.special_tokens = special_tokens or ['<pad>', '<unk>', '<sos>', '<eos>']
        self._build_lookup_tables()

    def _build_lookup_tables(self):
        """Rebuild the token/ID tables and merge-rank index from ``self.merges``."""
        self.token_to_id = {}

        for token in self.special_tokens:
            self.token_to_id.setdefault(token, len(self.token_to_id))

        for byte_value in range(256):
            self.token_to_id.setdefault(f"{byte_value:03d}", len(self.token_to_id))

        # Merged tokens need IDs too; without them every merge result would
        # encode as <unk>. The first occurrence of a pair defines its rank.
        self.merge_ranks = {}
        for rank, pair in enumerate(self.merges):
            pair = tuple(pair)
            self.merge_ranks.setdefault(pair, rank)
            self.token_to_id.setdefault(pair[0] + pair[1], len(self.token_to_id))

        self.id_to_token = {v: k for k, v in self.token_to_id.items()}
        self.special_token_ids = {tok: self.token_to_id[tok] for tok in self.special_tokens}
        self.merges_set = set(self.merge_ranks)

    def _build_token_vocab(self):
        """Snapshot the current token -> ID table into ``self.vocab`` for saving."""
        self.vocab = dict(self.token_to_id)

    def train(self, corpus, num_merges=10000, chunk_size=10000, verbose=True):
        """Learn BPE merges from ``corpus`` and rebuild the vocabulary.

        Args:
            corpus: Sequence of text strings.
            num_merges: Maximum number of merges to learn.
            chunk_size: Number of corpus entries joined and counted at a time.
            verbose: If true, print the chosen pair every 1000 merges and on
                the final merge step.
        """
        global_freqs = Counter()

        # `desc` belongs to tqdm, not to range().
        for start in tqdm(range(0, len(corpus), chunk_size), desc="Vocabulary Construction"):
            chunk = corpus[start:start + chunk_size]
            text = " ".join(chunk)

            for word in re.findall(self._PRETOKENIZE_PATTERN, text):
                # No end-of-word marker: encode() never adds one, so merges
                # that contained it could never be applied. Whitespace runs
                # are separate words, so boundaries are preserved anyway.
                byte_tokens = [f"{b:03d}" for b in word.encode("utf-8")]
                global_freqs[" ".join(byte_tokens)] += 1

        vocab = global_freqs
        self.merges = []

        for merge_step in tqdm(range(num_merges), desc="BPE Merging"):
            pairs = self._get_stats(vocab)
            if not pairs:
                break

            best_pair = max(pairs.items(), key=lambda x: x[1])[0]
            vocab = self._merge_vocab(best_pair, vocab)
            self.merges.append(best_pair)

            if verbose and (merge_step % 1000 == 0 or merge_step == num_merges - 1):
                print(f"Merge {merge_step + 1}: {best_pair} (freq: {pairs[best_pair]})")

        self._build_lookup_tables()
        self._build_token_vocab()

    def _get_stats(self, vocab):
        """Count adjacent symbol pairs across ``vocab``, weighted by word frequency."""
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for left, right in zip(symbols, symbols[1:]):
                pairs[(left, right)] += freq
        return pairs

    def _merge_vocab(self, pair, vocab):
        """Return a copy of ``vocab`` with every occurrence of ``pair`` fused."""
        new_vocab = Counter()
        # The lookarounds anchor the match on whole symbols only.
        pattern = re.compile(rf'(?<!\S){re.escape(pair[0])} {re.escape(pair[1])}(?!\S)')
        merged = pair[0] + pair[1]

        for word, freq in vocab.items():
            new_vocab[pattern.sub(merged, word)] += freq

        return new_vocab

    def encode(self, text, dropout=0.0):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: Input string.
            dropout: BPE-dropout probability. On each pass a candidate merge
                is kept only when ``random.random() > dropout``.

        Returns:
            List of integer token IDs (empty for empty input).
        """
        unk_id = self.special_token_ids.get("<unk>", 0)
        token_ids = []

        for word in re.findall(self._PRETOKENIZE_PATTERN, text):
            tokens = [f"{b:03d}" for b in word.encode("utf-8")]

            while len(tokens) > 1:
                candidates = [
                    pair for pair in zip(tokens, tokens[1:])
                    if pair in self.merge_ranks
                    and (dropout <= 0.0 or random.random() > dropout)
                ]
                if not candidates:
                    break

                # Apply the earliest-learned merge first (rank lookup is O(1)).
                best_pair = min(candidates, key=self.merge_ranks.__getitem__)
                merged_token = best_pair[0] + best_pair[1]

                new_tokens = []
                i = 0
                while i < len(tokens):
                    if i < len(tokens) - 1 and (tokens[i], tokens[i + 1]) == best_pair:
                        new_tokens.append(merged_token)
                        i += 2
                    else:
                        new_tokens.append(tokens[i])
                        i += 1

                tokens = new_tokens

            token_ids.extend(self.token_to_id.get(token, unk_id) for token in tokens)

        return token_ids

    def _token_to_bytes(self, token):
        """Expand a byte or merged token into its byte values.

        Returns an empty list when the token is not a valid sequence of
        three-digit byte codes.
        """
        digits = token.replace(self._END_OF_WORD, "")
        if len(digits) % 3 != 0 or not (digits.isascii() and digits.isdigit()):
            return []
        values = [int(digits[i:i + 3]) for i in range(0, len(digits), 3)]
        return values if all(value < 256 for value in values) else []

    def decode(self, token_ids):
        """Convert token IDs back into text.

        Special tokens and unknown IDs are skipped. Byte sequences that are
        not valid UTF-8 are dropped (``errors='ignore'``).

        Args:
            token_ids: Iterable of integer token IDs.

        Returns:
            The decoded string.
        """
        decoded = bytearray()

        for tid in token_ids:
            token = self.id_to_token.get(tid)
            if token is None or token in self.special_token_ids:
                continue
            # Merged tokens may span any number of bytes, not just one or two.
            decoded.extend(self._token_to_bytes(token))

        return decoded.decode('utf-8', errors='ignore')

    def save_model(self, filepath):
        """Write merges, vocab and special tokens to ``filepath`` as JSON."""
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump({
                "merges": self.merges,
                "vocab": self.vocab,
                "special_tokens": self.special_tokens
            }, f, ensure_ascii=False)

    @classmethod
    def load_model(cls, filepath):
        """Build a tokenizer from a JSON file written by ``save_model``."""
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        merges = [tuple(m) for m in data["merges"]]
        return cls(merges=merges, vocab=data.get("vocab"), special_tokens=data.get("special_tokens"))

In [45]:
from safetensors.torch import save_model, load_model

# Model architecture
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Args:
        head_size: Dimensionality of this head's key/query/value projections.
        n_embd: Dimensionality of the incoming embeddings.
        block_size: Maximum sequence length; sizes the causal mask buffer.
    """

    def __init__(self, head_size, n_embd, block_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask: position t may only attend to positions <= t.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Apply masked attention to ``x`` of shape (B, T, n_embd) -> (B, T, head_size)."""
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # Scale by the key dimension (head_size), not the embedding width C:
        # the dot products are sums over head_size terms, so 1/sqrt(head_size)
        # is what keeps the softmax inputs at unit scale.
        wei = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected.

    Args:
        num_heads: Number of ``Head`` modules.
        head_size: Output width of each head.
        n_embd: Embedding width of the input and of the projected output.
        block_size: Maximum sequence length, forwarded to each head.
    """

    def __init__(self, num_heads, head_size, n_embd, block_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd, block_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. That equals
        # n_embd only when n_embd is divisible by num_heads, so size the
        # projection from the real width instead of assuming it.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Map ``x`` of shape (B, T, n_embd) to an output of the same shape."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    """Position-wise MLP: expand to twice the width, ReLU, project back, dropout.

    Args:
        n_embd: Width of the input and output features.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 2 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(0.1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension stays ``n_embd`` wide."""
        return self.net(x)

class Block(nn.Module):
    """Pre-norm Transformer block: self-attention then MLP, each with a residual.

    Args:
        n_embd: Embedding width.
        n_head: Number of attention heads; each head is ``n_embd // n_head`` wide.
        block_size: Maximum sequence length, forwarded to the attention module.
    """

    def __init__(self, n_embd, n_head, block_size):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head, n_embd, block_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

class Transformer(nn.Module):
    """Decoder-only language model: embeddings, a stack of ``Block``s, LM head.

    Args:
        vocab_size: Number of token IDs (rows of the embedding table).
        n_embd: Embedding width.
        block_size: Maximum context length (rows of the position table).
        n_layer: Number of ``Block`` modules.
        n_head: Attention heads per block.
        tokenizer: Optional tokenizer providing ``encode``/``decode``; needed
            by ``generate_from_prompt`` and stored in the saved config.
        device: Device label kept on ``self.device`` for backward
            compatibility. Computation follows the actual device of the
            inputs and parameters, not this label.
    """

    def __init__(self, vocab_size, n_embd=512, block_size=256, n_layer=6, n_head=8, tokenizer=None, device='cuda'):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, block_size) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size
        self.n_head = n_head
        self.tokenizer = tokenizer
        self.device = device

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: Long tensor of token IDs, shape (B, T) with T <= block_size.
            targets: Optional long tensor of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab) and
            loss is ``None``; with targets, logits are flattened to
            (B*T, vocab) and loss is a scalar tensor.

        Raises:
            ValueError: If T exceeds ``block_size``.
        """
        B, T = idx.shape
        if T > self.block_size:
            raise ValueError(
                f"Sequence length {T} exceeds block_size {self.block_size}"
            )
        tok_emb = self.token_embedding_table(idx)
        # Build positions on the input's device rather than a notebook global,
        # so the model works wherever its inputs and weights actually live.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            logits = logits.view(B * T, -1)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens=100, temperature=1.0, top_k=30):
        """Sample ``max_new_tokens`` tokens autoregressively after ``idx``.

        Args:
            idx: Long tensor of shape (B, T) holding the context.
            max_new_tokens: Number of tokens to append.
            temperature: Positive softmax temperature.
            top_k: If not ``None``, sample only from the ``top_k`` most likely
                tokens (clamped to the vocabulary size).

        Returns:
            Long tensor of shape (B, T + max_new_tokens).

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        # Sample in eval mode, then restore the caller's mode so generating
        # mid-training does not silently leave dropout disabled.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # Only the last block_size tokens fit the position table.
                    idx_cond = idx[:, -self.block_size:]
                    logits, _ = self(idx_cond)
                    logits = logits[:, -1, :] / temperature

                    if top_k is not None:
                        # torch.topk raises when k exceeds the vocabulary size.
                        k = min(top_k, logits.size(-1))
                        v, _ = torch.topk(logits, k)
                        logits[logits < v[:, [-1]]] = -float('Inf')

                    probs = F.softmax(logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)
                    idx = torch.cat([idx, next_token], dim=1)
        finally:
            self.train(was_training)

        return idx

    def generate_from_prompt(self, prompt, max_new_tokens=100, temperature=1.0, top_k=None):
        """Encode ``prompt``, generate a continuation and decode the result.

        Returns:
            The decoded text of the prompt plus the generated tokens.

        Raises:
            ValueError: If no tokenizer is attached or the prompt encodes to
                zero tokens.
        """
        if self.tokenizer is None:
            raise ValueError("generate_from_prompt requires a tokenizer")
        tokens = self.tokenizer.encode(prompt)
        if not tokens:
            raise ValueError("prompt produced no tokens; provide a non-empty prompt")
        # Use the parameters' real device; self.device goes stale after .to().
        model_device = next(self.parameters()).device
        context = torch.tensor(tokens, dtype=torch.long, device=model_device).unsqueeze(0)
        generated = self.generate(context, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k)
        return self.tokenizer.decode(generated[0].tolist())

    @staticmethod
    def _config_path_for(filepath):
        """Return the JSON config path that accompanies a weights file.

        ``x.safetensors`` maps to ``x_config.json``. Any other name gets
        ``_config.json`` appended, so the config never shares a path with the
        weights file.
        """
        path = str(filepath)
        suffix = '.safetensors'
        stem = path[:-len(suffix)] if path.endswith(suffix) else path
        return stem + '_config.json'

    def save_model(self, filepath):
        """Save weights to ``filepath`` (safetensors) and the config next to it."""
        tokenizer_config = None
        if self.tokenizer is not None:
            tokenizer_config = {
                'merges': self.tokenizer.merges,
                'vocab': self.tokenizer.vocab,
                'special_tokens': self.tokenizer.special_tokens
            }

        # Record the sizes the weights really have, so load_model rebuilds a
        # model whose state dict matches under strict loading.
        config = {
            'vocab_size': self.token_embedding_table.num_embeddings,
            'n_embd': self.token_embedding_table.embedding_dim,
            'block_size': self.block_size,
            'n_layer': len(self.blocks),
            'n_head': self.n_head,
            'tokenizer_config': tokenizer_config
        }

        # Write the config to its own file.
        config_path = self._config_path_for(filepath)
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, ensure_ascii=False)

        # Inside a method the bare name `save_model` resolves to the
        # module-level import from safetensors.torch, not to this method.
        save_model(self, str(filepath))
        print(f"Model saved to {filepath} and config to {config_path}")

    @classmethod
    def load_model(cls, filepath, device='cuda'):
        """Rebuild a model (and tokenizer, if stored) saved by ``save_model``.

        Args:
            filepath: Path of the ``.safetensors`` weights file.
            device: Device to place the model on.

        Returns:
            The loaded model, in eval mode.
        """
        # Load the config file.
        config_path = cls._config_path_for(filepath)
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)

        # Recreate the tokenizer when one was stored.
        tokenizer = None
        tokenizer_config = config.get('tokenizer_config')
        if tokenizer_config is not None:
            tokenizer = OptimizedByteLevelBPE(
                merges=[tuple(m) for m in tokenizer_config['merges']],
                vocab=tokenizer_config['vocab'],
                special_tokens=tokenizer_config['special_tokens']
            )

        # Instantiate the model with the saved architecture.
        model = cls(
            vocab_size=config['vocab_size'],
            n_embd=config['n_embd'],
            block_size=config['block_size'],
            n_layer=config['n_layer'],
            n_head=config['n_head'],
            tokenizer=tokenizer,
            device=device
        ).to(device)

        # Load the weights (module-level safetensors `load_model`).
        load_model(model, str(filepath), strict=True)
        model.eval()
        return model


In [None]:
# Training setup
def get_batch(data, block_size, batch_size):
    """Sample a random batch of input/target windows from ``data``.

    Args:
        data: 1-D tensor of token IDs, longer than ``block_size``.
        block_size: Length of each window.
        batch_size: Number of windows to draw.

    Returns:
        ``(x, y)``, each of shape (batch_size, block_size) and moved to the
        notebook-level ``device``; ``y`` is ``x`` shifted one token ahead.
    """
    starts = torch.randint(len(data) - block_size, (batch_size,))
    inputs, labels = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(data[start:stop])
        # Targets are the same window advanced by one position.
        labels.append(data[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(labels).to(device)


@torch.no_grad()
def estimate_loss(model, train_data, val_data, block_size, batch_size, eval_iters):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(train_data if split == 'train' else val_data, block_size, batch_size)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def get_lr(it, warmup_iters=500, max_lr=1e-4, total_iters=10000):
    """Learning rate at iteration ``it``: linear warmup, then cosine decay.

    Args:
        it: Current iteration.
        warmup_iters: Iterations over which the rate rises linearly from 0.
        max_lr: Peak learning rate, reached when warmup ends.
        total_iters: Iteration at which the cosine decay reaches 0.

    Returns:
        The learning rate; 0.0 for any iteration beyond ``total_iters``.
    """
    # Warmup phase.
    if it < warmup_iters:
        return max_lr * it / warmup_iters
    # Past the end of the schedule.
    if it > total_iters:
        return 0.0
    # Cosine decay from max_lr down to 0.
    progress = (it - warmup_iters) / (total_iters - warmup_iters)
    cosine = math.cos(math.pi * progress)
    return max_lr * 0.5 * (1.0 + cosine)

In [None]:
def train_model():
    """Train the tokenizer (if needed) and the Transformer language model.

    Pipeline: load and clean the corpus, train or load the BPE tokenizer,
    tokenize the text, split 90/10 into train/validation, then run the
    training loop with warmup + cosine learning-rate scheduling, gradient
    clipping, periodic evaluation, checkpointing and early stopping.

    Returns:
        The model as it stands when training ends. The weights with the
        lowest validation loss are saved separately to
        ``checkpoints/best_model/best_model.safetensors``.

    Raises:
        ValueError: If either data split is not longer than ``block_size``.
    """
    # Hyperparameters
    batch_size = 16
    block_size = 1024
    max_iters = 50000
    eval_interval = 500
    learning_rate = 3e-4
    eval_iters = 200
    n_embd = 1024
    n_head = 16
    n_layer = 12
    save_interval = 100

    # Load and preprocess data
    full_corpus = load_and_preprocess_data(max_samples=50000)
    text = " ".join(full_corpus)

    # Initialize tokenizer
    tokenizer = OptimizedByteLevelBPE()
    tokenizer_path = "turkish_bpe_model.json"

    if not os.path.exists(tokenizer_path):
        print("Training tokenizer...")
        tokenizer.train(full_corpus, num_merges=3000, chunk_size=5000, verbose=True)
        tokenizer.save_model(tokenizer_path)
    else:
        print("Loading pretrained tokenizer...")
        tokenizer = OptimizedByteLevelBPE.load_model(tokenizer_path)

    # Tokenize text
    def encode_text(text):
        """Tokenize ``text`` piece by piece so tqdm can report progress."""
        words = re.findall(r'\S+|\s+', text)
        tokens = []
        for word in tqdm(words, desc="Tokenizing"):
            tokens.extend(tokenizer.encode(word))
        return tokens

    tokens = encode_text(text)
    data = torch.tensor(tokens, dtype=torch.long)

    # Train/val split
    n = int(0.9 * len(data))
    train_data = data[:n]
    val_data = data[n:]

    print(f"Total tokens: {len(data)}")
    print(f"Train data size: {len(train_data)}")
    print(f"Val data size: {len(val_data)}")

    # get_batch needs at least block_size + 1 tokens per split; fail early
    # with a readable message instead of an opaque torch.randint error.
    if min(len(train_data), len(val_data)) <= block_size:
        raise ValueError(
            f"Each data split must contain more than block_size={block_size} tokens "
            f"(train={len(train_data)}, val={len(val_data)})"
        )

    # Initialize model
    vocab_size = len(tokenizer.token_to_id)
    model = Transformer(
        vocab_size=vocab_size,
        n_embd=n_embd,
        block_size=block_size,
        n_layer=n_layer,
        n_head=n_head,
        tokenizer=tokenizer,
        device=device
    ).to(device)

    print(f"{sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters")

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-2)

    # Training loop
    best_val_loss = float('inf')
    patience_counter = 0
    patience = 3

    for step in range(max_iters):
        # LR scheduling. Pass the configured peak rate and horizon; the
        # defaults of get_lr (1e-4 over 10000 iterations) would ignore
        # learning_rate and leave the rate at zero for the last 40000 steps.
        lr = get_lr(step, max_lr=learning_rate, total_iters=max_iters)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # Get batch
        xb, yb = get_batch(train_data, block_size, batch_size)

        # Forward pass
        logits, loss = model(xb, yb)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        # Evaluation and logging
        if step % eval_interval == 0 or step == max_iters - 1:
            losses = estimate_loss(model, train_data, val_data, block_size, batch_size, eval_iters)
            train_loss = float(losses['train'])
            val_loss = float(losses['val'])

            print(f"Step {step}: Train {train_loss:.4f}, Val {val_loss:.4f}, LR {lr:.6f}")

            # Save checkpoint
            if step % save_interval == 0:
                checkpoint_path = f"checkpoints/checkpoint_{step}.safetensors"
                model.save_model(checkpoint_path)
                print(f"Saved checkpoint to {checkpoint_path}")

            # Save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                best_model_path = "checkpoints/best_model/best_model.safetensors"
                model.save_model(best_model_path)
                print(f"New best model saved to {best_model_path}")
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

    return model

In [38]:
# Run the whole pipeline (data loading, tokenizer setup, training loop) and
# keep the model that train_model() returns.
model = train_model()

Preprocessing data: 100%|██████████| 500/500 [00:00<00:00, 7536.26it/s]


Loading pretrained tokenizer...


Tokenizing: 100%|██████████| 35009/35009 [00:02<00:00, 13568.26it/s]


Total tokens: 72611
Train data size: 65349
Val data size: 7262
0.38M parameters
Step 0: Train 6.1170, Val 6.1168, LR 0.000000
Model saved to checkpoints/checkpoint_0.safetensors and config to checkpoints/checkpoint_0_config.json
Saved checkpoint to checkpoints/checkpoint_0.safetensors
Model saved to checkpoints/best_model/best_model.safetensors and config to checkpoints/best_model/best_model_config.json
New best model saved to checkpoints/best_model/best_model.safetensors
Step 500: Train 1.5764, Val 1.5263, LR 0.000300
Model saved to checkpoints/checkpoint_500.safetensors and config to checkpoints/checkpoint_500_config.json
Saved checkpoint to checkpoints/checkpoint_500.safetensors
Model saved to checkpoints/best_model/best_model.safetensors and config to checkpoints/best_model/best_model_config.json
New best model saved to checkpoints/best_model/best_model.safetensors
Step 999: Train 1.4914, Val 1.4513, LR 0.000298
Model saved to checkpoints/best_model/best_model.safetensors and confi

In [None]:
# Load the best checkpoint written by train_model(). checkpoint_0 is saved
# after a single optimizer step, so its weights are essentially untrained and
# would only produce noise.
model_path = "checkpoints/best_model/best_model.safetensors"
model = Transformer.load_model(model_path, device)

prompt = "Çin'de yapılan bir araştırmaya göre Çin Seddi'nin yapımı tam 100 yıl sürmüştür ve"
# A low temperature plus top-k keeps sampling close to the most likely tokens.
generated = model.generate_from_prompt(
    prompt,
    max_new_tokens=200,
    temperature=0.5,
    top_k=50
)

print("\nGenerated text:")
print(generated)