In [20]:
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
# Use the GPU when PyTorch reports that CUDA is available; otherwise fall back to the CPU.
# `device` is a notebook-wide global read by later cells (batch loading, model creation).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
# Seed PyTorch's default random number generator so that weight initialisation and
# batch sampling are repeatable. Python's `random` module (used by the tokenizer's
# BPE dropout) is not seeded here.
torch.manual_seed(1337)

<torch._C.Generator at 0x7f79940f49d0>

In [24]:
import re
import unicodedata
from typing import List, Generator
from datasets import load_dataset
from tqdm import tqdm

# Load the "train" split of the `musabg/wikipedia-tr-summarization` dataset via
# `datasets.load_dataset`; only its "summary" field is read further down.
dataset = load_dataset("musabg/wikipedia-tr-summarization", split='train')

def clean_text(text: str, remove_numbers: bool = False) -> str:
    """Clean and normalize a piece of Turkish text.

    Steps, in order:
      1. Unicode NFKC normalization (folds compatibility forms such as
         full-width letters and ligatures into their canonical characters).
      2. Removal of bracketed ``[...]`` and parenthesised ``(...)`` spans.
      3. Optional removal of numbers (see ``remove_numbers``).
      4. Collapsing of every whitespace run into a single space, then
         stripping leading/trailing whitespace.

    Args:
        text: Raw input text.
        remove_numbers: When True, digit runs (including ``.``/``,`` separated
            groups such as ``1.234,5``) are deleted. Defaults to False, which
            leaves numbers untouched.

    Returns:
        The cleaned text.
    """
    text = unicodedata.normalize("NFKC", text)
    # Non-greedy so that each bracket/parenthesis pair is removed separately.
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text)
    if remove_numbers:
        # Previously this flag was accepted but ignored; honour it here.
        text = re.sub(r'\d+(?:[.,]\d+)*', '', text)
    # Collapse whitespace last so gaps left by the removals above are cleaned up.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def preprocess_data(batch_size: int = 1000, max_samples: int = 5000) -> Generator[List[str], None, None]:
    """Yield cleaned dataset summaries in batches.

    Reads the "summary" field of the first ``max_samples`` rows of the global
    ``dataset`` (or fewer, if the dataset is smaller), cleans each one with
    ``clean_text`` and yields them as lists of at most ``batch_size`` strings.
    The final list may be shorter than ``batch_size``.
    """
    sample_count = min(len(dataset), max_samples)
    pending = []

    for index in tqdm(range(sample_count), desc="Data Preprocess", unit="sample"):
        pending.append(clean_text(dataset[index]["summary"]))

        # Keep accumulating until a full batch is available.
        if len(pending) < batch_size:
            continue

        yield pending
        pending = []

    # Flush whatever is left over after the last full batch.
    if pending:
        yield pending



# Clean the first 5000 summaries (in batches of the default size).
data_generator = preprocess_data(max_samples=5000)

# Flatten the yielded batches into one list of cleaned summaries.
full_corpus = []
for batch in data_generator:
    full_corpus.extend(batch)

print(f"{len(full_corpus)} data processed.")
print("Sample data:", full_corpus[0])

# Single space-joined training string; encoded into token IDs in a later cell.
text = " ".join(full_corpus)

Data Preprocess: 100%|██████████| 5000/5000 [00:00<00:00, 10387.84sample/s]

5000 data processed.
Sample data: Çin'in Sichuan Eyaletinde yer alan Sichuan Dev Panda Barınakları, dünyadaki tehlike altındaki dev panda popülasyonunun %30'unu barındıran ve 9245 km2'lik bir alana yayılan park alanıdır.





In [28]:
import re
import json
import random
from collections import Counter, defaultdict
from tqdm import tqdm
from typing import List, Dict, Tuple, Optional

class OptimizedByteLevelBPE:
    """Byte-level byte-pair-encoding (BPE) tokenizer.

    Text is split into words, single punctuation characters and whitespace
    runs. Every piece is converted to its UTF-8 bytes, each written as a
    zero-padded three-digit string ("000".."255"). A merged token is the
    concatenation of the strings of its parts, e.g. "196159" stands for the
    byte pair (196, 159).

    Token ID layout: special tokens first, then the 256 byte tokens, then every
    token that occurs in ``merges`` (in sorted order). IDs are always derived
    from ``special_tokens`` and ``merges``; the ``vocab`` argument is stored
    but not used to assign IDs.
    """

    # Splits text into word / single punctuation character / whitespace-run pieces.
    _PIECE_RE = re.compile(r'\w+|[^\w\s]|\s+')
    # End-of-word marker that may occur in merge lists written by earlier versions
    # of this class; it carries no bytes and is ignored when decoding.
    _LEGACY_EOW = "</w>"

    def __init__(self, merges: Optional[List[Tuple[str, str]]] = None,
                 vocab: Optional[Dict[str, int]] = None,
                 special_tokens: Optional[List[str]] = None):
        """Create a tokenizer from an (optional) list of learned merges.

        Args:
            merges: Merge pairs in the order they were learned (earlier pairs
                have priority). Pairs given as lists are converted to tuples.
            vocab: Optional token -> ID mapping kept on ``self.vocab``.
            special_tokens: Special tokens; they receive the lowest IDs.
                Defaults to ``['<pad>', '<unk>', '<sos>', '<eos>']``.
        """
        # Pairs must be hashable; JSON round-trips turn tuples into lists.
        self.merges = [tuple(pair) for pair in merges] if merges else []
        self.vocab = vocab or {}
        self.special_tokens = special_tokens or ['<pad>', '<unk>', '<sos>', '<eos>']
        self._build_lookup_tables()

    def _build_lookup_tables(self):
        """Build the token <-> ID tables from special tokens, bytes and merges."""
        self.token_to_id = {}

        # Special tokens
        for idx, token in enumerate(self.special_tokens):
            self.token_to_id[token] = idx

        offset = len(self.token_to_id)

        # Byte tokens (000 - 255)
        for i in range(256):
            self.token_to_id[f"{i:03d}"] = offset + i

        # Tokens referenced by the merges. They must be registered here as well,
        # otherwise a tokenizer constructed or loaded with merges would map every
        # merged token to <unk>.
        merge_tokens = set()
        for left, right in self.merges:
            merge_tokens.update((left, right, left + right))

        next_id = max(self.token_to_id.values()) + 1
        for token in sorted(merge_tokens):
            if token not in self.token_to_id:
                self.token_to_id[token] = next_id
                next_id += 1

        self.id_to_token = {v: k for k, v in self.token_to_id.items()}
        self.special_token_ids = {tok: self.token_to_id[tok] for tok in self.special_tokens}
        self.merges_set = set(self.merges)

        # Pair -> priority (index of its first occurrence in `merges`); replaces a
        # linear `list.index` scan for every candidate during encoding.
        self.merge_ranks = {}
        for rank, pair in enumerate(self.merges):
            self.merge_ranks.setdefault(pair, rank)

    def _build_token_vocab(self):
        """Rebuild the lookup tables and refresh ``self.vocab`` (all non-special tokens)."""
        self._build_lookup_tables()
        self.vocab = {k: v for k, v in self.token_to_id.items() if k not in self.special_tokens}

    def train(self, corpus: List[str], num_merges: int = 10000,
              chunk_size: int = 10000, verbose: bool = True):
        """Learn BPE merges from ``corpus``.

        Args:
            corpus: List of training texts.
            num_merges: Maximum number of merges to learn; training stops early
                once no adjacent symbol pair is left.
            chunk_size: Number of texts joined and scanned per counting step.
            verbose: Print the chosen pair every 1000 merges and at the last one.
        """
        global_freqs = Counter()

        # 1. Count how often each piece occurs, keyed by its space-separated byte tokens.
        #    No end-of-word marker is appended: `encode` never produces one, so
        #    merges involving such a marker could never be applied.
        for i in tqdm(range(0, len(corpus), chunk_size), desc="Vocabulary Construction"):
            text = " ".join(corpus[i:i + chunk_size])
            for word in self._PIECE_RE.findall(text):
                global_freqs[" ".join(f"{b:03d}" for b in word.encode("utf-8"))] += 1

        # 2. Repeatedly merge the most frequent adjacent pair.
        vocab = global_freqs
        self.merges = []

        for merge_step in tqdm(range(num_merges), desc="BPE Merging"):
            pairs = self._get_stats(vocab)
            if not pairs:
                break

            best_pair = max(pairs.items(), key=lambda x: x[1])[0]
            vocab = self._merge_vocab(best_pair, vocab)
            self.merges.append(best_pair)

            if verbose and (merge_step % 1000 == 0 or merge_step == num_merges - 1):
                print(f"Merge {merge_step + 1}: {best_pair} (freq: {pairs[best_pair]})")

        # Rebuild the tables once, now including the merged tokens. (Rebuilding the
        # base tables afterwards would discard the IDs of every merged token.)
        self._build_token_vocab()

    def _get_stats(self, vocab: Counter) -> Dict[Tuple[str, str], int]:
        """Return the frequency of every adjacent symbol pair in ``vocab``."""
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[(symbols[i], symbols[i + 1])] += freq
        return pairs

    def _merge_vocab(self, pair: Tuple[str, str], vocab: Counter) -> Counter:
        """Return a copy of ``vocab`` in which every occurrence of ``pair`` is merged."""
        new_vocab = Counter()
        # The look-arounds anchor the match on whole symbols, never inside a longer one.
        pattern = re.compile(rf'(?<!\S){re.escape(pair[0])} {re.escape(pair[1])}(?!\S)')
        merged = pair[0] + pair[1]

        for word, freq in vocab.items():
            # Accumulate rather than assign so that counts are never lost.
            new_vocab[pattern.sub(merged, word)] += freq

        return new_vocab

    def encode(self, text: str, dropout: float = 0.0) -> List[int]:
        """Convert ``text`` into a list of token IDs.

        Args:
            text: Text to encode.
            dropout: BPE-dropout probability. In every merge round each
                applicable pair is skipped with this probability; 0.0 gives the
                deterministic encoding.

        Returns:
            Token IDs; a token missing from the vocabulary maps to ``<unk>``.
        """
        token_ids = []

        for word in self._PIECE_RE.findall(text):
            tokens = [f"{b:03d}" for b in word.encode("utf-8")]

            # Apply the highest-priority applicable merge until none is left.
            while len(tokens) > 1:
                candidates = [
                    pair for pair in zip(tokens, tokens[1:])
                    if pair in self.merge_ranks
                    and (dropout <= 0.0 or random.random() >= dropout)
                ]
                if not candidates:
                    break

                best_pair = min(candidates, key=self.merge_ranks.__getitem__)
                merged_token = best_pair[0] + best_pair[1]

                new_tokens = []
                i = 0
                while i < len(tokens):
                    if i < len(tokens) - 1 and (tokens[i], tokens[i + 1]) == best_pair:
                        new_tokens.append(merged_token)
                        i += 2
                    else:
                        new_tokens.append(tokens[i])
                        i += 1

                tokens = new_tokens

            for token in tokens:
                token_ids.append(self.token_to_id.get(token, self.special_token_ids["<unk>"]))

        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """Convert token IDs back into text.

        Special tokens, unknown IDs and tokens that do not spell a byte sequence
        are skipped. Bytes that do not form valid UTF-8 are dropped.
        """
        decoded_bytes = []

        for tid in token_ids:
            token = self.id_to_token.get(tid, '<unk>')
            if token in self.special_token_ids:
                continue  # skip special tokens

            # A token is a run of 3-digit byte codes of any length (merges of
            # merges are longer than two bytes), optionally with a legacy marker.
            digits = token.replace(self._LEGACY_EOW, "")
            if len(digits) % 3 != 0 or not (digits.isascii() and digits.isdigit()):
                continue
            values = [int(digits[i:i + 3]) for i in range(0, len(digits), 3)]
            if any(value > 255 for value in values):
                continue
            decoded_bytes.extend(values)

        return bytes(decoded_bytes).decode('utf-8', errors='ignore')

    def save_model(self, prefix: str):
        """Write merges, vocab and special tokens as JSON to the file path ``prefix``."""
        with open(prefix, "w", encoding="utf-8") as f:
            json.dump({
                "merges": self.merges,
                "vocab": self.vocab,
                "special_tokens": self.special_tokens
            }, f, ensure_ascii=False)

    @classmethod
    def load_model(cls, prefix: str):
        """Load a tokenizer saved with ``save_model`` and return it as a NEW instance.

        This is a classmethod: use ``tok = OptimizedByteLevelBPE.load_model(path)``.
        Calling it on an existing instance does not modify that instance.
        """
        with open(prefix, "r", encoding="utf-8") as f:
            data = json.load(f)
        merges = [tuple(m) for m in data["merges"]]
        return cls(
            merges=merges,
            vocab=data["vocab"],
            special_tokens=data["special_tokens"]
        )


In [29]:
# Fresh tokenizer without any learned merges (special tokens + 256 byte tokens only).
tokenizer = OptimizedByteLevelBPE()

# Training run, currently disabled. NOTE(review): it would save to 'turkish_bpe',
# whereas the next cell loads 'turkish_bpe_model.json' — confirm the intended path.
#tokenizer.train(
#    full_corpus,
#    num_merges=3000,
#    chunk_size=5000,
#    verbose=True
#)
#tokenizer.save_model('turkish_bpe')

In [31]:
# NOTE(review): `load_model` is a classmethod that returns a NEW tokenizer. Its return
# value is discarded here, so `tokenizer` stays the merge-free instance created in the
# previous cell (consistent with the vocabulary size of 260 printed further down).
# The intended call is probably `tokenizer = OptimizedByteLevelBPE.load_model(...)`;
# confirm before changing it, because that alters the vocabulary size the model is built with.
tokenizer.load_model('turkish_bpe_model.json')

# Round-trip check: encode a string of Turkish-specific letters and decode it again
encoded = tokenizer.encode("şçöğü")
decoded = tokenizer.decode(encoded)
print(decoded)  # expected output: "şçöğü"

şçöğü


In [None]:
# Turkish sentences containing language-specific letters (İ, ş, ç, ğ, ü, ö, ı),
# used to check that encode followed by decode reproduces the input exactly.
test_cases = [
    "İstanbul'da şehir içi ulaşım çok karmaşık",
    "Pijamalı hasta yağız şoföre çabucak güvendi",
    "Fahiş fiyatlarla mücadele ederken güğümsü renkler içinde"
]

# `_text` (with underscore) avoids overwriting the global `text` that holds the corpus.
for _text in test_cases:
    print(f"\nTesting: {_text}")
    encoded = tokenizer.encode(_text)
    decoded = tokenizer.decode(encoded)
    print("Success!" if decoded == _text else "Failed!")


Testing: İstanbul'da şehir içi ulaşım çok karmaşık
Success!

Testing: Pijamalı hasta yağız şoföre çabucak güvendi
Success!

Testing: Fahiş fiyatlarla mücadele ederken güğümsü renkler içinde
Success!


In [34]:
from tqdm import tqdm

def encode_text_with_bpe_ids(bpe_obj, text):
    """Encode ``text`` piece by piece and return the concatenated token IDs.

    The text is cut into alternating runs of non-whitespace and whitespace;
    each run is passed to ``bpe_obj.encode`` and the results are joined in
    order. Progress is reported per run.
    """
    pieces = re.findall(r'\S+|\s+', text)
    return [
        token_id
        for piece in tqdm(pieces, desc="Encoding with BPE")
        for token_id in bpe_obj.encode(piece)
    ]

# Encode the whole joined corpus (`text`) and keep the IDs as one 1-D int64 tensor.
tokens = encode_text_with_bpe_ids(tokenizer, text)
data = torch.tensor(tokens, dtype=torch.long)

# Sequential 90% / 10% split: the first part trains, the tail validates.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

print(f"Total token: {len(data)}")
print(f"Train data size: {len(train_data)}")
print(f"Val data size: {len(val_data)}")

Encoding with BPE: 100%|██████████| 355077/355077 [00:02<00:00, 129658.31it/s]


Total token: 1470241
Train data size: 1323216
Val data size: 147025


In [35]:
# data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    ``split == 'train'`` reads ``train_data``; any other value reads
    ``val_data``. ``batch_size`` start offsets are drawn at random, each giving
    a window of ``block_size`` tokens; the target window is the same window
    shifted one position to the right. Both tensors are moved to ``device``.
    """
    source = train_data if split == 'train' else val_data
    # Upper bound leaves room for the one-token shift of the targets.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [36]:
@torch.no_grad()
def estimate_loss():
    """Return the mean loss over ``eval_iters`` random batches for each split.

    The global ``model`` is put into eval mode for the measurement and switched
    back to train mode before returning. The result is a dict with the keys
    'train' and 'val', each holding the mean loss as a 0-dim tensor.
    """
    model.eval()
    mean_losses = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            _, batch_loss = model(*get_batch(split))
            batch_losses[step] = batch_loss.item()
        mean_losses[split] = batch_losses.mean()
    model.train()
    return mean_losses

In [37]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Args:
        head_size: Output width of the key/query/value projections.
        n_embd: Width of the input embeddings.
        block_size: Maximum sequence length; sets the size of the causal mask.
        dropout: Dropout probability applied to the attention weights.
            Defaults to 0.1, the value that was previously hard-coded.
    """

    def __init__(self, head_size, n_embd, block_size, dropout=0.1):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask: position t may only attend to positions <= t.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, T, n_embd) to attention output of shape (B, T, head_size)."""
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # NOTE(review): the scores are scaled by the embedding width C, not by
        # head_size as in standard scaled dot-product attention. Kept unchanged
        # because changing it alters the outputs of already-trained checkpoints.
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)
        # Hide future positions before the softmax.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

In [38]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, followed by an output projection.

    Args:
        num_heads: Number of attention heads.
        head_size: Output width of each head.
        n_embd: Width of the input and output embeddings.
        block_size: Maximum sequence length (forwarded to every head).
        dropout: Dropout probability applied after the output projection.
            Defaults to 0.1, the value that was previously hard-coded. It is
            not forwarded to the heads, which keep their own default.
    """

    def __init__(self, num_heads, head_size, n_embd, block_size, dropout=0.1):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd, block_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size. That equals n_embd only when n_embd is divisible
        # by num_heads, so size it explicitly instead of assuming n_embd.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate all head outputs along the feature axis and project to n_embd."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


In [39]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear -> Dropout.

    The hidden layer is twice as wide as the embedding.

    Args:
        n_embd: Width of the input and output embeddings.
        dropout: Dropout probability applied to the output. Defaults to 0.1,
            the value that was previously hard-coded.
    """

    def __init__(self, n_embd, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 2 * n_embd),
            nn.ReLU(),
            nn.Linear(2 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Apply the network to the last dimension of x."""
        return self.net(x)

In [40]:
class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each with a residual.

    Layer normalisation is applied to the input of each sub-layer (pre-norm),
    and the sub-layer output is added back onto its un-normalised input.
    """

    def __init__(self, n_embd, n_head, block_size):
        super().__init__()
        # Every head gets an equal share of the embedding width.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head, n_embd, block_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

In [86]:
class Transformers(nn.Module):
    """Decoder-only Transformer language model over token IDs.

    Args:
        vocab_size: Number of distinct token IDs.
        n_embd: Embedding width.
        block_size: Maximum context length.
        n_layer: Number of Transformer blocks.
        n_head: Number of attention heads per block.
        tokenizer: Optional tokenizer offering ``encode(str) -> List[int]`` and
            ``decode(List[int]) -> str``; required only by ``generate_from_prompt``.
    """

    def __init__(self, vocab_size, n_embd=512, block_size=256, n_layer=6, n_head=8, tokenizer=None):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, block_size) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size
        self.tokenizer = tokenizer  # Injected tokenizer

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: Token IDs of shape (B, T) with T <= block_size.
            targets: Optional token IDs of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            (B, T, vocab_size) and loss is None. With targets, logits are
            flattened to (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            logits = logits.view(B * T, -1)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens=100, temperature=1.0, top_k=30):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Args:
            idx: Context token IDs of shape (B, T), T >= 1.
            max_new_tokens: Number of tokens to append.
            temperature: Logits are divided by this value before sampling.
            top_k: If not None, sampling is restricted to the ``top_k`` most
                likely tokens (capped at the vocabulary size).

        Returns:
            Tensor of shape (B, T + max_new_tokens).

        The model is switched to eval mode while sampling and restored to its
        previous train/eval mode afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # Only the most recent block_size tokens fit into the context.
                    idx_cond = idx[:, -self.block_size:]
                    logits, _ = self(idx_cond)
                    logits = logits[:, -1, :] / temperature

                    if top_k is not None:
                        # torch.topk raises if k exceeds the vocabulary size.
                        k = min(top_k, logits.size(-1))
                        v, _ = torch.topk(logits, k)
                        logits[logits < v[:, [-1]]] = -float('Inf')

                    probs = F.softmax(logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)
                    idx = torch.cat([idx, next_token], dim=1)
        finally:
            # Do not leave a model that was training silently stuck in eval mode.
            self.train(was_training)

        return idx

    def generate_from_prompt(self, prompt, max_new_tokens=100, temperature=1.0, top_k=30):
        """Encode ``prompt``, generate a continuation and return the decoded text.

        Raises:
            AssertionError: If no tokenizer was provided.
            ValueError: If the prompt encodes to no tokens.
        """
        assert self.tokenizer is not None, "Tokenizer must be provided"
        prompt_ids = self.tokenizer.encode(prompt)
        if not prompt_ids:
            raise ValueError("prompt must encode to at least one token")
        # Place the prompt on the device the model actually lives on.
        model_device = next(self.parameters()).device
        idx = torch.tensor([prompt_ids], dtype=torch.long, device=model_device)
        out = self.generate(idx, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k)
        return self.tokenizer.decode(out[0].tolist())

    @classmethod
    def from_pretrained(cls, model_path, tokenizer, device=None, **kwargs):
        """Build a model, load a saved ``state_dict`` and return it in eval mode.

        Args:
            model_path: Path to a file written with ``torch.save(model.state_dict(), ...)``.
            tokenizer: Tokenizer to attach to the model.
            device: Target device. Defaults to CUDA when available, else CPU.
            **kwargs: Constructor arguments (``vocab_size``, ``n_embd``, ...);
                they must match the architecture the checkpoint was saved from.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = cls(tokenizer=tokenizer, **kwargs)
        # weights_only=True restricts unpickling to tensors and plain containers,
        # which is all a state_dict contains.
        state_dict = torch.load(model_path, map_location=device, weights_only=True)
        model.load_state_dict(state_dict)
        model.to(device)
        model.eval()
        return model
    




In [71]:
# --- Training configuration (notebook-wide globals) ---
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 10000  # upper bound on training iterations
eval_interval = 500  # evaluate train/val loss every this many iterations
# Initial optimizer learning rate; the training loop overwrites it on every
# iteration with the value returned by get_lr().
learning_rate = 3e-4

# NOTE(review): grad_clip is not read by the training loop, which passes a
# literal max_norm=1.0 to clip_grad_norm_.
grad_clip = 1.0
# Early-stopping state: best validation loss so far and the number of
# consecutive evaluations without improvement.
best_val_loss = float('inf')
patience_counter = 0
patience = 3  # stop after this many evaluations without improvement

eval_iters = 200  # batches averaged per split in estimate_loss()
n_embd = 768
n_head = 12
n_layer = 8
# NOTE(review): this value is never passed to the model; the model classes in
# this notebook are built with a dropout rate of 0.1.
dropout = 0.2
# ------------

# Model vocabulary size = number of entries in the tokenizer's ID table.
vocab_size = len(tokenizer.token_to_id)
vocab_size

260

In [None]:
# Build the model from the configuration above. No tokenizer is passed, so
# generate_from_prompt() cannot be used on this instance (it asserts that one is set).
model = Transformers(vocab_size=vocab_size, n_embd=n_embd, block_size=block_size, n_layer=n_layer, n_head=n_head)
model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-2)

38.395652 M parameters


In [None]:
import torch
import math


def get_lr(it, warmup_iters=500, max_lr=1e-3, total_iters=5000):
    """Learning rate for iteration ``it``: linear warm-up, then cosine decay.

    Args:
        it: Current iteration (0-based).
        warmup_iters: Number of iterations over which the rate rises linearly
            from 0 to ``max_lr``.
        max_lr: Peak learning rate, reached at ``it == warmup_iters``.
        total_iters: Iteration at which the cosine decay reaches 0.

    Returns:
        The learning rate; 0.0 for every ``it`` greater than ``total_iters``.
    """
    if it < warmup_iters:
        return max_lr * it / warmup_iters
    if it > total_iters:
        return 0.0

    decay_span = total_iters - warmup_iters
    if decay_span <= 0:
        # No room for a decay phase (total_iters == warmup_iters): the single
        # remaining iteration is the peak. Avoids a division by zero.
        return max_lr

    decay_ratio = (it - warmup_iters) / decay_span
    return max_lr * 0.5 * (1.0 + math.cos(math.pi * decay_ratio))

# Training loop with a per-iteration learning-rate schedule, periodic evaluation,
# checkpointing of the best model and early stopping.
# NOTE(review): `iter` shadows the Python built-in of the same name.
for iter in range(max_iters):
    # Learning rate scheduler
    # NOTE(review): get_lr() is called with its defaults (total_iters=5000), so it
    # returns 0.0 for every iteration above 5000 although max_iters is 10000 —
    # confirm whether total_iters should follow max_iters.
    lr = get_lr(iter)
    for g in optimizer.param_groups:
        g['lr'] = lr

    # Evaluation and logging
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        train_loss = losses['train']
        val_loss = losses['val']

        print(f"Step {iter}: Train {train_loss:.4f}, Val {val_loss:.4f}, LR {lr:.6f}")

        # Keep the weights with the lowest validation loss seen so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
            print("The new model is better than the old model. The best model has been updated.")
        else:
            # Stop once `patience` consecutive evaluations brought no improvement.
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping.")
                break

    # Training step
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    # Clip the global gradient norm (literal 1.0; the `grad_clip` setting is not used here).
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

In [87]:
# Rebuild the model with the same architecture settings and load saved weights.
# NOTE(review): this loads 'best_model_wiki.pt', whereas the training loop above
# writes 'best_model.pt' — confirm which checkpoint is intended.
model = Transformers.from_pretrained("best_model_wiki.pt", tokenizer=tokenizer,
                                     vocab_size=vocab_size, n_embd=n_embd,
                                     block_size=block_size, n_layer=n_layer, n_head=n_head)

# Continue a Turkish prompt with 256 sampled tokens (temperature 0.5, top-30 sampling).
# Note: this rebinds `text`, which earlier held the joined training corpus.
text = model.generate_from_prompt("Çin'de yapılan bir araştırmaya göre Çin Seddi'nin yapımı tam 100 yıl sürmüştür ve", max_new_tokens=256, temperature=0.5, top_k=30)
print(text)


  model.load_state_dict(torch.load(model_path, map_location=device))


Çin'de yapılan bir araştırmaya göre Çin Seddi'nin yapımı tam 100 yıl sürmüştür ve sonrasında Manisa'ya yerleşerek İstanbul'u ele geçirme seçeneklerini sağlamıştır. İranlı asker ve siyasetçi Ali Paşa, İran'da doğdu ve İran'da hayatını kaybetti; ayrıca Maliye Bakanlığı ve Karşılaştırma Enstitüsü gibi durumlar 
