<a href="https://colab.research.google.com/github/emredeveloper/Transformers--General-AI/blob/main/Transformer_Attention_FFN_Varyantlari_Performans_T.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

# This file collects all the relevant code that we covered thus far
# throughout Chapters 2-5.
# This file can be run as a standalone script.

import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt


#####################################
# Chapter 2
#####################################

class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text string.

    The text is tokenized once. Every window of ``max_length`` token ids
    becomes an input sample, and the same window shifted right by one token
    becomes its target. Window starts advance by ``stride`` tokens.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of token ids.
        max_length: Number of tokens per input/target sample.
        stride: Step between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A start position is used only if the one-token-shifted target also
        # fits, which is why the range stops at len(ids) - max_length.
        starts = range(0, len(ids) - max_length, stride)
        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` of sliding-window samples from raw text.

    The text is tokenized with the tiktoken "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are forwarded to ``DataLoader``.

    Args:
        txt: Raw text to turn into training samples.
        batch_size: Samples per batch.
        max_length: Tokens per sample window.
        stride: Step between the starts of consecutive windows.
        shuffle: Passed to ``DataLoader``.
        drop_last: Passed to ``DataLoader``.
        num_workers: Passed to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)
    return DataLoader(windows, **loader_options)


#####################################
# Chapter 3
#####################################

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a causal (upper-triangular) mask.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size across all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the pre-built causal mask, i.e. the
            longest sequence the mask can cover.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # per-head slice of the projection

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated heads
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions.
        future_positions = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', future_positions)

    def forward(self, x):
        """Return the attention output of shape ``(b, num_tokens, d_out)``.

        ``x`` is unpacked as ``(b, num_tokens, d_in)``.
        """
        b, num_tokens, _ = x.shape

        def split_heads(projected):
            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
            return projected.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        keys = split_heads(self.W_key(x))
        queries = split_heads(self.W_query(x))
        values = split_heads(self.W_value(x))

        # Per-head dot products between every query and every key.
        scores = queries @ keys.transpose(2, 3)

        # Crop the stored mask to the current sequence length and blank out
        # the future positions before the softmax.
        future = self.mask.bool()[:num_tokens, :num_tokens]
        scores.masked_fill_(future, -torch.inf)

        weights = torch.softmax(scores / self.head_dim ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, d_out)
        merged = (weights @ values).transpose(1, 2).reshape(b, num_tokens, self.d_out)
        return self.out_proj(merged)


#####################################
# Chapter 4
#####################################

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale/shift.

    Args:
        emb_dim: Size of the last dimension being normalized.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance to avoid division by zero
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Population variance (unbiased=False), i.e. divide by N rather than N-1.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = centered / torch.sqrt(variance + self.eps)
        return self.scale * normalized + self.shift


class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation."""

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = sqrt_two_over_pi * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))


class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    The hidden layer is four times as wide as ``cfg["emb_dim"]``; input and
    output widths are both ``cfg["emb_dim"]``.

    Args:
        cfg: Mapping that provides the ``"emb_dim"`` key.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden_width = 4 * width
        self.layers = nn.Sequential(
            nn.Linear(width, hidden_width),
            GELU(),
            nn.Linear(hidden_width, width),
        )

    def forward(self, x):
        """Run ``x`` through the expand / activate / project stack."""
        return self.layers(x)


class TransformerBlock(nn.Module):
    """One transformer layer: attention and feed-forward, each with a shortcut.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``; the same
    dropout module is used for both shortcuts.

    Args:
        cfg: Mapping with the keys ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        # Attention sub-layer; its input is added back as a shortcut.
        attended = self.att(self.norm1(x))  # [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(attended) + x

        # Feed-forward sub-layer with its own shortcut.
        transformed = self.ff(self.norm2(x))
        x = self.drop_shortcut(transformed) + x

        return x


class GPTModel(nn.Module):
    """GPT-style language model: embeddings, transformer blocks, output head.

    Args:
        cfg: Mapping with the keys ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"``, plus
            whatever ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape ``(batch, seq_len)`` to vocabulary logits."""
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        # Token and position embeddings are summed; the position embeddings
        # broadcast over the batch dimension.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)


def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping a ``(batch, n_tokens)`` id tensor to logits
            indexed as ``[batch, position, vocab]``.
        idx: ``(batch, n_tokens)`` tensor of token ids used as the prompt.
        max_new_tokens: Number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the
            model at each step.

    Returns:
        Tensor of shape ``(batch, n_tokens + max_new_tokens)``.
    """
    sequence = idx
    for _ in range(max_new_tokens):
        # Keep only the most recent tokens the model is allowed to see.
        window = sequence[:, -context_size:]

        with torch.no_grad():
            all_logits = model(window)

        # Only the prediction at the last position matters for the next token.
        last_logits = all_logits[:, -1, :]
        next_token = torch.argmax(last_logits, dim=-1, keepdim=True)  # (batch, 1)

        sequence = torch.cat((sequence, next_token), dim=1)

    return sequence


#####################################
# Chapter 5
####################################


def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Both batches are moved to ``device``. The first two dimensions of the
    logits are flattened together and the targets are flattened fully, so the
    loss is computed per token.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    per_token_logits = model(inputs).flatten(0, 1)
    return torch.nn.functional.cross_entropy(per_token_logits, targets.flatten())


def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable of ``(input_batch, target_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Integer cap on the number of batches to evaluate. ``None``
            means all of them; larger values are clamped to
            ``len(data_loader)``.

    Returns:
        The mean batch loss as a float, or ``nan`` when there is nothing to
        evaluate (empty loader, or ``num_batches`` of zero or less).
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    num_batches = available if num_batches is None else min(num_batches, available)
    if num_batches <= 0:
        # Nothing requested: report "no value" instead of dividing by zero.
        return float("nan")

    total_loss = 0.
    # range() comes first so zip stops before pulling an unused extra batch
    # from the loader once the cap is reached.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        total_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return total_loss / num_batches


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` averaged over up to ``eval_iter`` batches.

    The model is switched to eval mode and gradients are disabled while the
    losses are computed; afterwards the model is put into train mode.
    """
    model.eval()
    with torch.no_grad():
        train_loss, val_loss = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context, max_new_tokens=50):
    """Greedily continue ``start_context`` and print the result on one line.

    Args:
        model: Model exposing ``pos_emb``; the number of rows of
            ``pos_emb.weight`` is used as the context size.
        tokenizer: Tokenizer passed to ``text_to_token_ids`` and
            ``token_ids_to_text``.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text.
        max_new_tokens: Number of tokens to generate (defaults to the
            previously hard-coded 50).

    The model is put in eval mode for generation and in train mode afterwards.
    """
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=max_new_tokens, context_size=context_size)
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()


def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Draw training and validation loss curves on a shared figure.

    The bottom x-axis is labelled in epochs; a second x-axis on top is
    labelled in tokens seen. The figure is built but not shown here.
    """
    fig, epoch_axis = plt.subplots(figsize=(5, 3))

    # Loss curves against epochs
    curves = (
        (train_losses, {"label": "Training loss"}),
        (val_losses, {"linestyle": "-.", "label": "Validation loss"}),
    )
    for losses, line_style in curves:
        epoch_axis.plot(epochs_seen, losses, **line_style)
    epoch_axis.set_xlabel("Epochs")
    epoch_axis.set_ylabel("Loss")
    epoch_axis.legend(loc="upper right")

    # Second x-axis sharing the same y-axis; the fully transparent curve only
    # exists so the "tokens seen" ticks line up with the data.
    token_axis = epoch_axis.twiny()
    token_axis.plot(tokens_seen, train_losses, alpha=0)
    token_axis.set_xlabel("Tokens seen")

    fig.tight_layout()  # make room for both axis labels
    # plt.show()


def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the ids with a leading batch dimension."""
    ids = tokenizer.encode(text)
    return torch.unsqueeze(torch.tensor(ids), 0)


def token_ids_to_text(token_ids, tokenizer):
    """Drop the leading batch dimension of ``token_ids`` and decode them to text."""
    ids = token_ids.squeeze(0).tolist()
    return tokenizer.decode(ids)


In [None]:
import time
import torch
import torch.optim as optim

#####################################
# Ayarlar ve Veri Hazırlığı
#####################################

def load_data():
    """Return the demo corpus: one fixed sentence repeated 100 times."""
    # Small text for demo purposes; a real application should use a much
    # larger corpus.
    sentence = ("Once upon a time, in a land far, far away, there was a kingdom where magic was common "
                "and adventure awaited around every corner. ")
    repetitions = 100  # repeat the sentence to lengthen the text
    return sentence * repetitions

def prepare_dataloaders(text, batch_size=4, max_length=128, stride=64):
    """Split ``text`` 90/10 by characters and build train/validation loaders.

    Args:
        text: Full corpus as one string.
        batch_size: Samples per batch for both loaders.
        max_length: Tokens per sample window.
        stride: Step between the starts of consecutive windows.

    Returns:
        ``(train_loader, val_loader)``.
    """
    # First 90% of the characters for training, the rest for validation.
    split_idx = int(0.9 * len(text))
    train_loader = create_dataloader_v1(text[:split_idx], batch_size=batch_size,
                                        max_length=max_length, stride=stride)
    # The validation split is small: with the loader defaults
    # (shuffle=True, drop_last=True) a split that yields fewer than
    # ``batch_size`` windows produces no batch at all, which makes the
    # validation loss NaN. Keep every sample and a fixed order so the
    # validation loss is defined and comparable between epochs.
    val_loader = create_dataloader_v1(text[split_idx:], batch_size=batch_size,
                                      max_length=max_length, stride=stride,
                                      shuffle=False, drop_last=False)
    return train_loader, val_loader

#####################################
# Model Eğitimi
#####################################

def train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10):
    """Train ``model`` with Adam (lr=3e-4) and print progress per epoch.

    Args:
        model: Model to optimize; it is moved to ``device``.
        train_loader: Sized iterable of ``(input_batch, target_batch)`` pairs.
        val_loader: Loader passed to ``evaluate_model`` after each epoch.
        device: Device used for the model and the batches.
        epochs: Number of passes over ``train_loader``.
        eval_iter: Maximum number of batches per loader used for the
            end-of-epoch evaluation.
    """
    # Move the model first, then build the optimizer, as the torch.optim
    # documentation recommends.
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=3e-4)
    batches_per_epoch = len(train_loader)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        start_time = time.time()

        for batch_idx, (input_batch, target_batch) in enumerate(train_loader, start=1):
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the sum and the log line.
            batch_loss = loss.item()
            epoch_loss += batch_loss

            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1} Batch {batch_idx}, Loss: {batch_loss:.4f}")

        # An empty loader has no average; report NaN instead of dividing by zero.
        avg_loss = epoch_loss / batches_per_epoch if batches_per_epoch else float("nan")
        elapsed = time.time() - start_time
        print(f"Epoch {epoch+1} tamamlandı (süre: {elapsed:.2f}s), ort. loss: {avg_loss:.4f}")

        # Short evaluation: compute the training and validation losses.
        train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
        print(f"Epoch {epoch+1} değerlendirme: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}\n")

#####################################
# Metin Üretimi
#####################################

def generate_sample(model, tokenizer, device, prompt, max_new_tokens=50):
    """Print a header followed by a greedy continuation of ``prompt``.

    Args:
        model: Model exposing ``pos_emb``; the number of rows of
            ``pos_emb.weight`` is used as the context size.
        tokenizer: Tokenizer passed to ``text_to_token_ids`` and
            ``token_ids_to_text``.
        device: Device the encoded prompt is moved to.
        prompt: Text to continue.
        max_new_tokens: Number of tokens to generate.

    The model is put in eval mode for generation and in train mode afterwards.
    """
    print("Üretilen metin örneği:\n")

    # Generation is done here rather than delegated, so that the requested
    # ``max_new_tokens`` is actually honored (it used to be dropped).
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(prompt, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=max_new_tokens, context_size=context_size)
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # compact single-line output
    model.train()

#####################################
# Ana Fonksiyon
#####################################

def main():
    """Run the small demo: build data, train a GPT model, print a sample."""
    # Device selection (CUDA is used when torch reports it as available)
    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_available else "cpu")
    print(f"Kullanılan cihaz: {device}")

    # Data preparation
    train_loader, val_loader = prepare_dataloaders(
        load_data(), batch_size=4, max_length=128, stride=64)

    # Tokenizer and model configuration
    tokenizer = tiktoken.get_encoding("gpt2")
    cfg = dict(
        vocab_size=tokenizer.n_vocab,  # number of entries in the tokenizer vocabulary
        emb_dim=128,                   # small embedding size (demo purposes)
        context_length=128,            # maximum sequence length
        drop_rate=0.1,
        n_layers=8,                    # number of transformer layers
        n_heads=4,                     # number of heads (must divide emb_dim)
        qkv_bias=True,
    )

    # Build and train the model
    model = GPTModel(cfg)
    train_model(model, train_loader, val_loader, device, epochs=5, eval_iter=10)

    # After training, generate text from a starting prompt
    generate_sample(model, tokenizer, device, "Once upon a time ", max_new_tokens=50)

# Entry point: run the demo only when this code is executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
import time
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import tiktoken
from datasets import load_dataset  # Hugging Face datasets library
import re

#####################################
# Rotary Positional Embeddings (ROPE) Implementation
#####################################
def apply_rotary_pos_emb(x):
    """
    Rotate the feature pairs of ``x`` by position-dependent angles (RoPE).

    The last dimension is split into a first and a second half; element ``i``
    of the first half is paired with element ``i`` of the second half, and each
    pair is rotated by ``position * 10000 ** (-2i / head_dim)``.

    Args:
        x (torch.Tensor): Input tensor of shape (batch, num_heads, seq_len, head_dim).

    Returns:
        torch.Tensor: Rotated tensor with the same shape as ``x``.

    Raises:
        AssertionError: If ``head_dim`` is odd.
    """
    _, _, seq_len, head_dim = x.shape
    assert head_dim % 2 == 0, "head_dim must be even for ROPE"
    half = head_dim // 2

    # One inverse frequency per feature pair, one angle per (position, pair).
    exponents = torch.arange(0, head_dim, 2, device=x.device).float() / head_dim
    inv_freq = 1.0 / (10000 ** exponents)
    positions = torch.arange(seq_len, device=x.device).float()
    angles = torch.einsum("i,j->ij", positions, inv_freq)  # (seq_len, head_dim/2)

    # Leading singleton axes broadcast over batch and heads.
    sin = torch.sin(angles)[None, None]
    cos = torch.cos(angles)[None, None]

    first_half = x[..., :half]
    second_half = x[..., half:]
    rotated_first = first_half * cos - second_half * sin
    rotated_second = first_half * sin + second_half * cos
    return torch.cat([rotated_first, rotated_second], dim=-1)

#####################################
# Dataset and DataLoader: Wikitext (Hugging Face)
#####################################
class GPTDatasetV1(Dataset):
    """Dataset of overlapping token windows and their next-token targets.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of token ids.
        max_length: Number of tokens per input/target sample.
        stride: Step between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        # Tokenize once, then cut windows out of the id sequence.
        ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # The target is the input shifted by one token, so a window start is
        # valid only while start + max_length is still inside the sequence.
        window_starts = range(0, len(ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(ids[start:start + max_length]) for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1]) for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Tokenize ``text`` with the tiktoken "gpt2" encoding and wrap it in a ``DataLoader``.

    ``max_length`` and ``stride`` configure the ``GPTDatasetV1`` windows; the
    remaining keyword arguments are forwarded to ``DataLoader`` unchanged.
    """
    windows = GPTDatasetV1(text, tiktoken.get_encoding("gpt2"), max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

def load_wikitext_data(num_lines=10000, dataset_name="wikitext", subset="wikitext-103-raw-v1"):
    """
    Load a Hugging Face dataset and join the first ``num_lines`` training lines.

    Args:
        num_lines (int): Number of lines to take from the start of the train split.
        dataset_name (str): Name of the dataset.
        subset (str): Subset (configuration) of the dataset.

    Returns:
        str: The selected lines joined with newline characters.
    """
    ds = load_dataset(dataset_name, subset)
    # Slice the rows first and only then pick the column. Indexing the column
    # first (ds["train"]["text"][:n]) can build a Python list of every line in
    # the split before slicing, depending on the datasets version.
    text_lines = ds["train"][:num_lines]["text"]
    return "\n".join(text_lines)

def preprocess_text(text):
    """
    Keep only ASCII letters and single spaces.

    Every character that is neither an ASCII letter nor whitespace is removed,
    each run of whitespace becomes one space, and leading/trailing whitespace
    is dropped.

    Args:
        text (str): Input text.

    Returns:
        str: Cleaned text.
    """
    letters_and_whitespace = re.sub(r'[^A-Za-z\s]', '', text)
    # Splitting on whitespace and re-joining collapses runs and trims the ends.
    return " ".join(letters_and_whitespace.split())

#####################################
# Advanced Model Components (GPTModel)
#####################################

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention with optional rotary embeddings.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size across heads; must be divisible by ``num_heads``.
        context_length: Side length of the pre-built causal mask.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections use a bias term.
        use_rope: If true, ``apply_rotary_pos_emb`` is applied to the queries
            and keys before the scores are computed.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, use_rope=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.use_rope = use_rope

        # Ones strictly above the diagonal mark positions a token must not see.
        upper_triangle = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer('mask', upper_triangle)

    def forward(self, x):
        """Return the attention output of shape ``(b, num_tokens, d_out)``."""
        b, num_tokens, _ = x.shape
        per_head_shape = (b, num_tokens, self.num_heads, self.head_dim)

        # Project, then move the head axis in front of the token axis:
        # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
        keys = self.W_key(x).view(*per_head_shape).transpose(1, 2)
        queries = self.W_query(x).view(*per_head_shape).transpose(1, 2)
        values = self.W_value(x).view(*per_head_shape).transpose(1, 2)

        if self.use_rope:
            queries = apply_rotary_pos_emb(queries)
            keys = apply_rotary_pos_emb(keys)

        scores = queries @ keys.transpose(2, 3)
        # Crop the stored mask to the current length and hide future tokens.
        hidden_positions = self.mask.bool()[:num_tokens, :num_tokens]
        scores.masked_fill_(hidden_positions, float("-inf"))

        weights = torch.softmax(scores / math.sqrt(self.head_dim), dim=-1)
        weights = self.dropout(weights)

        # Merge the heads back into a single feature dimension.
        merged = (weights @ values).transpose(1, 2).reshape(b, num_tokens, self.d_out)
        return self.out_proj(merged)

class LayerNorm(nn.Module):
    """Normalize the last dimension to zero mean / unit variance, then scale and shift.

    Args:
        emb_dim: Size of the last dimension being normalized.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # keeps the denominator away from zero
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Return the normalized, scaled and shifted version of ``x``."""
        mu = x.mean(dim=-1, keepdim=True)
        # Population variance (divide by N, not N-1).
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        x_hat = (x - mu) / torch.sqrt(sigma_sq + self.eps)
        return self.scale * x_hat + self.shift

class GELU(nn.Module):
    """GELU activation computed with the tanh-based approximation."""

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(math.sqrt(2.0 / math.pi) * cubic_term)
        return 0.5 * x * gate

class FeedForward(nn.Module):
    """Two-layer MLP with a GELU in between and a 4x wide hidden layer.

    Args:
        cfg: Mapping that provides the ``"emb_dim"`` key.
    """

    def __init__(self, cfg):
        super().__init__()
        model_width = cfg["emb_dim"]
        expanded_width = 4 * model_width
        expand = nn.Linear(model_width, expanded_width)
        activation = GELU()
        project = nn.Linear(expanded_width, model_width)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through expand -> GELU -> project."""
        return self.layers(x)

class TransformerBlock(nn.Module):
    """Transformer layer: normalized attention and feed-forward with shortcuts.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``; one
    dropout module is shared by both shortcuts.

    Args:
        cfg: Mapping with the keys ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``. The optional
            ``"use_rope"`` key (default ``False``) is forwarded to the
            attention module.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        attention_options = dict(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
            use_rope=cfg.get("use_rope", False),
        )
        self.att = MultiHeadAttention(**attention_options)
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers to ``x``."""
        # Attention sub-layer plus shortcut.
        attn_out = self.drop_shortcut(self.att(self.norm1(x)))
        x = attn_out + x

        # Feed-forward sub-layer plus shortcut.
        ff_out = self.drop_shortcut(self.ff(self.norm2(x)))
        return ff_out + x

class GPTModel(nn.Module):
    """GPT-style decoder: token + learned position embeddings, blocks, LM head.

    Args:
        cfg: Mapping with the keys ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"``, plus
            whatever ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        width, vocab = cfg["emb_dim"], cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        layer_stack = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*layer_stack)

        self.final_norm = LayerNorm(width)
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, in_idx):
        """Map token ids of shape ``(batch, seq_len)`` to vocabulary logits."""
        _, seq_len = in_idx.shape
        position_ids = torch.arange(seq_len, device=in_idx.device)

        # Position embeddings broadcast over the batch dimension.
        x = self.tok_emb(in_idx) + self.pos_emb(position_ids)
        x = self.trf_blocks(self.drop_emb(x))
        return self.out_head(self.final_norm(x))

#####################################
# Training and Evaluation Functions
#####################################
def calc_loss_batch(input_batch, target_batch, model, device):
    """Compute the per-token cross-entropy of ``model`` on one batch.

    Inputs and targets are moved to ``device``; the first two dimensions of
    the logits are merged so every token counts as one classification example.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    flat_logits = model(inputs).flatten(0, 1)
    flat_targets = targets.flatten()
    return nn.functional.cross_entropy(flat_logits, flat_targets)

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable of ``(input_batch, target_batch)`` pairs.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Integer cap on the number of batches to evaluate. ``None``
            means all of them; larger values are clamped to
            ``len(data_loader)``.

    Returns:
        The mean batch loss as a float, or ``nan`` when there is nothing to
        evaluate (empty loader, or ``num_batches`` of zero or less).
    """
    loader_len = len(data_loader)
    if loader_len == 0:
        return float("nan")

    if num_batches is None:
        num_batches = loader_len
    num_batches = min(num_batches, loader_len)
    if num_batches <= 0:
        # Avoid the division by zero an explicit cap of 0 used to trigger.
        return float("nan")

    total_loss = 0.0
    # range() is listed first so zip stops as soon as the cap is reached,
    # without fetching an extra batch that would be thrown away.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` over at most ``eval_iter`` batches each.

    Runs in eval mode with gradients disabled, then puts the model into train
    mode before returning.
    """
    model.eval()
    losses = []
    with torch.no_grad():
        for loader in (train_loader, val_loader):
            losses.append(calc_loss_loader(loader, model, device, num_batches=eval_iter))
    model.train()
    train_loss, val_loss = losses
    return train_loss, val_loss

def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Append ``max_new_tokens`` greedily chosen tokens to ``idx``.

    At every step the model sees only the last ``context_size`` tokens, and
    the highest-scoring vocabulary entry at the final position is appended.

    Returns:
        Tensor of shape ``(batch, n_tokens + max_new_tokens)``.
    """
    tokens = idx
    remaining = max_new_tokens
    while remaining > 0:
        context = tokens[:, -context_size:]
        with torch.no_grad():
            step_logits = model(context)
        # Only the last position predicts the token that follows the context.
        best = torch.argmax(step_logits[:, -1, :], dim=-1, keepdim=True)
        tokens = torch.cat((tokens, best), dim=1)
        remaining -= 1
    return tokens

def generate_and_print_sample(model, tokenizer, device, start_context, max_new_tokens=50):
    """Greedily continue ``start_context`` and print the result on one line.

    Args:
        model: Model exposing ``pos_emb``; the number of rows of
            ``pos_emb.weight`` is used as the context size.
        tokenizer: Tokenizer passed to ``text_to_token_ids`` and
            ``token_ids_to_text``.
        device: Device the encoded prompt is moved to.
        start_context: Prompt text.
        max_new_tokens: Number of tokens to generate (defaults to the
            previously hard-coded 50).

    The model is put in eval mode for generation and in train mode afterwards.
    """
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=max_new_tokens, context_size=context_size)
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))  # newlines flattened for compact output
    model.train()

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch axis."""
    token_tensor = torch.tensor(tokenizer.encode(text))
    return torch.unsqueeze(token_tensor, 0)

def token_ids_to_text(token_ids, tokenizer):
    """Squeeze out the leading batch axis of ``token_ids`` and decode the ids."""
    id_list = torch.squeeze(token_ids, 0).tolist()
    return tokenizer.decode(id_list)

def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot training and validation loss and show the figure.

    The bottom x-axis is labelled in epochs; a second x-axis on top is
    labelled in tokens seen.
    """
    fig, epoch_axis = plt.subplots(figsize=(5, 3))

    curves = (
        (train_losses, {"label": "Training loss"}),
        (val_losses, {"linestyle": "-.", "label": "Validation loss"}),
    )
    for losses, line_style in curves:
        epoch_axis.plot(epochs_seen, losses, **line_style)
    epoch_axis.set_xlabel("Epochs")
    epoch_axis.set_ylabel("Loss")
    epoch_axis.legend(loc="upper right")

    # Top axis shares the y-axis; the transparent curve only positions its ticks.
    token_axis = epoch_axis.twiny()
    token_axis.plot(tokens_seen, train_losses, alpha=0)
    token_axis.set_xlabel("Tokens seen")

    fig.tight_layout()
    plt.show()

#####################################
# Model Training
#####################################
def train_model(model, train_loader, val_loader, device, epochs=30, eval_iter=20, lr=1e-4):
    """Train ``model`` with Adam and print progress per epoch.

    Args:
        model: Model to optimize; it is moved to ``device``.
        train_loader: Sized iterable of ``(input_batch, target_batch)`` pairs.
        val_loader: Loader passed to ``evaluate_model`` after each epoch.
        device: Device used for the model and the batches.
        epochs: Number of passes over ``train_loader``.
        eval_iter: Maximum number of batches per loader used for the
            end-of-epoch evaluation.
        lr: Learning rate passed to Adam.
    """
    # Move the model first, then build the optimizer, as the torch.optim
    # documentation recommends.
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    batches_per_epoch = len(train_loader)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        start_time = time.time()

        for batch_idx, (input_batch, target_batch) in enumerate(train_loader, start=1):
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the sum and the log line.
            batch_loss = loss.item()
            epoch_loss += batch_loss
            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1} Batch {batch_idx}, Loss: {batch_loss:.4f}")

        # An empty loader has no average; report NaN instead of dividing by zero.
        avg_loss = epoch_loss / batches_per_epoch if batches_per_epoch else float("nan")
        elapsed = time.time() - start_time
        print(f"Epoch {epoch+1} completed (time: {elapsed:.2f}s), avg. loss: {avg_loss:.4f}")
        train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
        print(f"Epoch {epoch+1} evaluation: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}\n")

#####################################
# Text Generation
#####################################
def generate_sample(model, tokenizer, device, prompt, max_new_tokens=50):
    """Print a header followed by a greedy continuation of ``prompt``.

    Args:
        model: Model exposing ``pos_emb``; the number of rows of
            ``pos_emb.weight`` is used as the context size.
        tokenizer: Tokenizer passed to ``text_to_token_ids`` and
            ``token_ids_to_text``.
        device: Device the encoded prompt is moved to.
        prompt: Text to continue.
        max_new_tokens: Number of tokens to generate.

    The model is put in eval mode for generation and in train mode afterwards.
    """
    print("Generated text sample:\n")

    # Generation is done here rather than delegated, so that the requested
    # ``max_new_tokens`` is actually honored (it used to be dropped).
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(prompt, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=max_new_tokens, context_size=context_size)
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # compact single-line output
    model.train()

#####################################
# Main Function
#####################################
def main():
    """Train a small RoPE-enabled GPT model on Wikitext lines and print a sample."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load Wikitext data from Hugging Face (first 50k lines) and clean it.
    text = load_wikitext_data(num_lines=50000, dataset_name="wikitext", subset="wikitext-103-raw-v1")
    text = preprocess_text(text)

    # The sample window length and the model's context length must agree,
    # so both are taken from one value.
    context_length = 256

    # Split by characters: first 90% for training, the rest for validation.
    split_idx = int(0.9 * len(text))
    train_loader = create_dataloader_v1(
        text[:split_idx], batch_size=8, max_length=context_length, stride=128)
    # Validation keeps every sample in a fixed order so the reported loss is
    # measured on the same batches after each epoch.
    val_loader = create_dataloader_v1(
        text[split_idx:], batch_size=8, max_length=context_length, stride=128,
        shuffle=False, drop_last=False)

    # Tokenizer and model configuration
    tokenizer = tiktoken.get_encoding("gpt2")
    cfg = {
        "vocab_size": tokenizer.n_vocab,
        "emb_dim": 256,
        "context_length": context_length,
        "drop_rate": 0.1,
        "n_layers": 6,
        "n_heads": 8,
        "qkv_bias": True,
        "use_rope": True,
    }

    model = GPTModel(cfg)
    train_model(model, train_loader, val_loader, device, epochs=1, eval_iter=25, lr=1e-5)

    # Generate text after training with a given prompt
    prompt = "Valkyria Chronicles III "
    generate_sample(model, tokenizer, device, prompt, max_new_tokens=100)

# Entry point: run the Wikitext demo only when executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import time

#############################################
# 1. Alternatif Normalizasyon: RMSNorm
#############################################
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable scale.

    Args:
        emb_dim: Size of the last dimension being normalized.
        eps: Constant added to the mean square before the square root.
    """

    def __init__(self, emb_dim, eps=1e-8):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):
        """Divide ``x`` by the RMS of its last dimension and apply the scale."""
        # x shape: (..., emb_dim)
        mean_square = torch.mean(x ** 2, dim=-1, keepdim=True)
        rms = torch.sqrt(mean_square + self.eps)
        return self.scale * (x / rms)

def get_norm(norm_type, emb_dim):
    """Build the normalization layer selected by ``norm_type``.

    Args:
        norm_type: ``'layernorm'`` for ``nn.LayerNorm`` or ``'rmsnorm'`` for
            ``RMSNorm``.
        emb_dim: Size of the normalized dimension.

    Raises:
        ValueError: If ``norm_type`` is neither of the two supported names.
    """
    if norm_type == 'layernorm':
        return nn.LayerNorm(emb_dim)
    if norm_type == 'rmsnorm':
        return RMSNorm(emb_dim)
    raise ValueError("Unknown normalization type")

#############################################
# 2. Ortak Konfigürasyon
#############################################
class Config:
    """Shared hyper-parameter container for the model variants.

    Args:
        vocab_size: Vocabulary size.
        emb_dim: Embedding width.
        max_length: Maximum sequence length.
        n_layers: Number of layers.
        n_heads: Number of attention heads.
        dropout: Dropout probability.
        norm_type: ``'layernorm'`` or ``'rmsnorm'``.
    """

    def __init__(self, vocab_size=30522, emb_dim=768, max_length=512, n_layers=4, n_heads=12,
                 dropout=0.1, norm_type='layernorm'):
        self.vocab_size, self.emb_dim, self.max_length = vocab_size, emb_dim, max_length
        self.n_layers, self.n_heads = n_layers, n_heads
        self.dropout = dropout
        self.norm_type = norm_type  # 'layernorm' or 'rmsnorm'

        # Extra parameters for the advanced variants:
        self.latent_dim = emb_dim // 2   # for RoPE and the latent projection
        self.num_experts = 4             # number of experts used in the MoE FFN

#############################################
# --- Attention Modülleri ---
#############################################
# 1. Standard Dot-Product Attention
class StandardAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    No attention mask is applied: every position attends to every other
    position of the same sequence.

    Args:
        emb_dim: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, emb_dim, n_heads, dropout):
        super().__init__()
        assert emb_dim % n_heads == 0, "Embedding boyutu baş sayısına tam bölünmeli."
        self.n_heads = n_heads
        self.head_dim = emb_dim // n_heads
        self.q_proj = nn.Linear(emb_dim, emb_dim)
        self.k_proj = nn.Linear(emb_dim, emb_dim)
        self.v_proj = nn.Linear(emb_dim, emb_dim)
        self.out_proj = nn.Linear(emb_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected):
        """Reshape (batch, seq, emb) into (batch, heads, seq, head_dim)."""
        n_batch, n_tokens, _ = projected.size()
        return projected.view(n_batch, n_tokens, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (batch, seq_len, emb_dim)."""
        batch, seq_len, emb_dim = x.size()
        queries = self._split_heads(self.q_proj(x))
        keys = self._split_heads(self.k_proj(x))
        values = self._split_heads(self.v_proj(x))

        logits = (queries @ keys.transpose(-2, -1)) / math.sqrt(self.head_dim)
        weights = self.dropout(logits.softmax(dim=-1))

        # Merge the heads back into one embedding axis.
        merged = (weights @ values).transpose(1, 2).contiguous().view(batch, seq_len, emb_dim)
        return self.out_proj(merged)

# 2. RoPE Attention
def apply_rope(x, base=10000):
    """Apply rotary position embeddings (RoPE) to ``x``.

    The last dimension is split into a first and a second half; element
    ``k`` of the first half is paired with element ``k`` of the second half
    and each pair is rotated by ``position * base ** (-2k / head_dim)``.

    Args:
        x: Tensor whose last two dimensions are ``(seq_len, head_dim)``,
            e.g. ``(batch, n_heads, seq_len, head_dim)``.
        base: Base of the geometric frequency progression.

    Returns:
        Tensor with the same shape and dtype as ``x``.

    Raises:
        ValueError: If ``head_dim`` is odd. The two halves would then differ
            in size and broadcasting could silently produce a wrong shape.
    """
    seq_len, head_dim = x.shape[-2], x.shape[-1]
    if head_dim % 2 != 0:
        raise ValueError(f"apply_rope requires an even head_dim, got {head_dim}")
    half = head_dim // 2

    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=x.device).float() / head_dim))
    pos = torch.arange(seq_len, device=x.device).float()
    angles = pos[:, None] * inv_freq[None, :]  # (seq_len, head_dim / 2)
    sin, cos = torch.sin(angles), torch.cos(angles)

    x1 = x[..., :half]
    x2 = x[..., half:]
    rotated = torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
    # sin/cos are float32, so low-precision inputs are promoted during the
    # rotation; cast back so the caller gets its own dtype.
    return rotated.to(x.dtype)

class RoPEAttention(nn.Module):
    """Multi-head self-attention whose queries and keys are rotated by RoPE.

    Identical to plain scaled dot-product attention except that
    ``apply_rope`` is applied to the per-head queries and keys before the
    scores are computed. Values are left untouched and no mask is used.

    Args:
        emb_dim: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, emb_dim, n_heads, dropout):
        super().__init__()
        assert emb_dim % n_heads == 0, "Embedding boyutu baş sayısına tam bölünmeli."
        self.n_heads = n_heads
        self.head_dim = emb_dim // n_heads
        self.q_proj = nn.Linear(emb_dim, emb_dim)
        self.k_proj = nn.Linear(emb_dim, emb_dim)
        self.v_proj = nn.Linear(emb_dim, emb_dim)
        self.out_proj = nn.Linear(emb_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply RoPE self-attention to ``x`` of shape (batch, seq_len, emb_dim)."""
        batch, seq_len, emb_dim = x.size()
        per_head = (batch, seq_len, self.n_heads, self.head_dim)

        # Project, split into heads, then rotate queries and keys only.
        queries = apply_rope(self.q_proj(x).view(*per_head).transpose(1, 2))
        keys = apply_rope(self.k_proj(x).view(*per_head).transpose(1, 2))
        values = self.v_proj(x).view(*per_head).transpose(1, 2)

        logits = (queries @ keys.transpose(-2, -1)) / math.sqrt(self.head_dim)
        weights = self.dropout(logits.softmax(dim=-1))

        merged = (weights @ values).transpose(1, 2).contiguous().view(batch, seq_len, emb_dim)
        return self.out_proj(merged)

# 3. FlashAttention benzeri Attention (placeholder)
def flash_attention(Q, K, V):
    """Reference scaled dot-product attention (placeholder, not a fused kernel).

    Computes ``softmax(Q K^T / sqrt(d)) V`` with ``d`` taken from the last
    dimension of ``Q``. No mask and no dropout are applied.
    """
    scale = math.sqrt(Q.shape[-1])
    weights = ((Q @ K.transpose(-2, -1)) / scale).softmax(dim=-1)
    return weights @ V

class FlashAttentionModule(nn.Module):
    """Multi-head self-attention that delegates the core step to ``flash_attention``.

    Args:
        emb_dim: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        dropout: Dropout probability stored in ``self.dropout``.

    NOTE: ``self.dropout`` is constructed but never applied in ``forward``,
    so the ``dropout`` argument currently has no effect on this variant.
    """

    def __init__(self, emb_dim, n_heads, dropout):
        super().__init__()
        assert emb_dim % n_heads == 0, "Embedding boyutu baş sayısına tam bölünmeli."
        self.n_heads = n_heads
        self.head_dim = emb_dim // n_heads
        self.q_proj = nn.Linear(emb_dim, emb_dim)
        self.k_proj = nn.Linear(emb_dim, emb_dim)
        self.v_proj = nn.Linear(emb_dim, emb_dim)
        self.out_proj = nn.Linear(emb_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (batch, seq_len, emb_dim)."""
        batch, seq_len, emb_dim = x.size()
        per_head = (batch, seq_len, self.n_heads, self.head_dim)

        queries = self.q_proj(x).view(*per_head).transpose(1, 2)
        keys = self.k_proj(x).view(*per_head).transpose(1, 2)
        values = self.v_proj(x).view(*per_head).transpose(1, 2)

        attended = flash_attention(queries, keys, values)
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, emb_dim)
        return self.out_proj(merged)

# 4. Multi-Query Attention: Keys & Values tek projeksiyon
class MultiQueryAttention(nn.Module):
    """Multi-query self-attention: per-head queries, one shared key/value head.

    Queries are projected to ``n_heads`` heads of size ``head_dim``, while
    keys and values are projected once to a single ``head_dim``-wide head
    that is shared (broadcast) across all query heads. No mask is applied.

    Args:
        emb_dim: Model width; must be divisible by ``n_heads``.
        n_heads: Number of query heads.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        AssertionError: If ``emb_dim`` is not divisible by ``n_heads``
            (same check and message as the sibling attention modules).
    """

    def __init__(self, emb_dim, n_heads, dropout):
        super().__init__()
        # Without this check a bad configuration only fails later, inside
        # ``forward``, with an opaque ``view`` shape error.
        assert emb_dim % n_heads == 0, "Embedding boyutu baş sayısına tam bölünmeli."
        self.n_heads = n_heads
        self.head_dim = emb_dim // n_heads
        self.q_proj = nn.Linear(emb_dim, emb_dim)
        self.k_proj = nn.Linear(emb_dim, self.head_dim)
        self.v_proj = nn.Linear(emb_dim, self.head_dim)
        self.out_proj = nn.Linear(emb_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply multi-query attention to ``x`` of shape (batch, seq_len, emb_dim)."""
        batch, seq_len, emb_dim = x.size()
        Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        # The single key/value head is expanded (a view, no copy) to all query heads.
        shared_shape = (batch, self.n_heads, seq_len, self.head_dim)
        K = self.k_proj(x).unsqueeze(1).expand(*shared_shape)
        V = self.v_proj(x).unsqueeze(1).expand(*shared_shape)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attn = self.dropout(torch.softmax(scores, dim=-1))
        context = torch.matmul(attn, V)
        context = context.transpose(1, 2).contiguous().view(batch, seq_len, emb_dim)
        return self.out_proj(context)

# 5. ALiBi Attention: Lineer bias ekleyerek göreceli pozisyon bilgisini entegre eder (Press et al., 2021)
class ALiBiAttention(nn.Module):
    """Multi-head self-attention with a linear distance bias (ALiBi-style).

    A bias of ``alibi_scaling * |i - j|`` is added to the score between
    query position ``i`` and key position ``j`` before the softmax. With
    the default negative slope, attention decays with distance in both
    directions. No mask is applied.

    The previous implementation used the signed offset ``(j - i)``. With a
    negative slope that penalised future keys but *rewarded* past keys more
    the farther away they were, which is the opposite of a distance penalty.

    Unlike Press et al. (2021), one slope is shared by all heads rather
    than a per-head geometric sequence.

    Args:
        emb_dim: Model width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
        alibi_scaling: Slope multiplied by the absolute token distance.
            Use a negative value to penalise distant tokens.
    """

    def __init__(self, emb_dim, n_heads, dropout, alibi_scaling=-1.0):
        super().__init__()
        assert emb_dim % n_heads == 0, "Embedding boyutu, baş sayısına tam bölünmeli."
        self.n_heads = n_heads
        self.head_dim = emb_dim // n_heads
        self.q_proj = nn.Linear(emb_dim, emb_dim)
        self.k_proj = nn.Linear(emb_dim, emb_dim)
        self.v_proj = nn.Linear(emb_dim, emb_dim)
        self.out_proj = nn.Linear(emb_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)
        self.alibi_scaling = alibi_scaling

    def forward(self, x):
        """Apply distance-biased self-attention to ``x`` of shape (batch, seq_len, emb_dim)."""
        batch, seq_len, emb_dim = x.size()
        Q = self.q_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        K = self.k_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        V = self.v_proj(x).view(batch, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Distance bias: B[i, j] = alibi_scaling * |i - j|, broadcast over
        # the batch and head dimensions.
        positions = torch.arange(seq_len, device=x.device)
        distance = (positions.unsqueeze(0) - positions.unsqueeze(1)).abs().to(scores.dtype)
        scores = scores + self.alibi_scaling * distance

        attn = self.dropout(torch.softmax(scores, dim=-1))
        context = torch.matmul(attn, V)
        context = context.transpose(1, 2).contiguous().view(batch, seq_len, emb_dim)
        return self.out_proj(context)

#############################################
# --- FFN Varyantları ---
#############################################
# 1. Standart FFN
class StandardFFN(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear.

    Args:
        emb_dim: Input and output width.
        expansion: Multiplier giving the hidden width ``expansion * emb_dim``.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, emb_dim, expansion=4, dropout=0.1):
        super().__init__()
        hidden_dim = expansion * emb_dim
        stages = [
            nn.Linear(emb_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, emb_dim),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Transform ``x`` independently at every position."""
        return self.net(x)

# 2. MoE FFN
class MoEFFN(nn.Module):
    """Densely-gated mixture-of-experts feed-forward layer.

    Every expert (Linear -> GELU -> Dropout -> Linear) processes every
    token; the outputs are combined with softmax weights produced by a
    linear gate. There is no top-k routing.

    Args:
        emb_dim: Input and output width.
        num_experts: Number of parallel expert networks.
        expansion: Multiplier giving each expert's hidden width.
        dropout: Dropout probability inside each expert.
    """

    def __init__(self, emb_dim, num_experts, expansion=4, dropout=0.1):
        super().__init__()
        self.num_experts = num_experts
        hidden_dim = expansion * emb_dim

        def build_expert():
            return nn.Sequential(
                nn.Linear(emb_dim, hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, emb_dim),
            )

        self.experts = nn.ModuleList([build_expert() for _ in range(num_experts)])
        self.gate = nn.Linear(emb_dim, num_experts)

    def forward(self, x):
        """Return the gate-weighted sum of all expert outputs for ``x``."""
        # (batch, seq_len, num_experts): one mixing weight per expert and token.
        mixing = self.gate(x).softmax(dim=-1)
        # (batch, seq_len, emb_dim, num_experts): experts stacked on a new last axis.
        stacked = torch.stack([expert(x) for expert in self.experts], dim=-1)
        # Broadcast the weights over the embedding axis and sum out the experts.
        return (stacked * mixing.unsqueeze(2)).sum(dim=-1)

#############################################
# --- Transformer Bloğu: Seçilebilir Attention ve FFN varyantları, Dropout, Pre-Norm ---
#############################################
class TransformerBlock(nn.Module):
    """Pre-norm transformer block with pluggable attention and FFN modules.

    Each sub-layer normalizes its input first and adds the result back onto
    the residual stream: ``x + attn(norm1(x))`` followed by
    ``x + ffn(norm2(x))``.

    Args:
        emb_dim: Model width.
        n_heads: Number of attention heads.
        attn_module: Callable invoked as ``attn_module(emb_dim, n_heads, dropout)``.
        ffn_module: Callable invoked as ``ffn_module(emb_dim, dropout=dropout)``
            (e.g. ``StandardFFN`` or a factory wrapping ``MoEFFN``).
        dropout: Dropout probability forwarded to both sub-modules.
        norm_type: Normalization name understood by ``get_norm``.
    """

    def __init__(self, emb_dim, n_heads, attn_module, ffn_module, dropout, norm_type):
        super().__init__()
        # Attention sub-layer
        self.norm1 = get_norm(norm_type, emb_dim)
        self.attn = attn_module(emb_dim, n_heads, dropout)
        # Feed-forward sub-layer
        self.norm2 = get_norm(norm_type, emb_dim)
        self.ffn = ffn_module(emb_dim, dropout=dropout)

    def forward(self, x):
        """Run both residual sub-layers on ``x`` and return the result."""
        after_attn = x + self.attn(self.norm1(x))
        return after_attn + self.ffn(self.norm2(after_attn))

#############################################
# --- Transformer Modeli: Farklı varyantların seçilebildiği yapı ---
#############################################
class TransformerModel(nn.Module):
    """Transformer stack with selectable attention and feed-forward variants.

    Token ids are embedded, a learned positional embedding is added, the
    result passes through ``config.n_layers`` ``TransformerBlock`` layers, a
    final normalization, and a bias-free projection to vocabulary logits.

    Args:
        config: Object providing ``vocab_size``, ``emb_dim``, ``max_length``,
            ``n_layers``, ``n_heads``, ``dropout``, ``norm_type`` and
            ``num_experts``.
        attn_variant: One of ``'standard'``, ``'rope'``, ``'flash'``,
            ``'multiquery'``, ``'alibi'``.
        ffn_variant: ``'standard'`` or ``'moe'``.
    """

    def __init__(self, config, attn_variant='standard', ffn_variant='standard'):
        super().__init__()
        self.token_embed = nn.Embedding(config.vocab_size, config.emb_dim)
        self.pos_embed   = nn.Embedding(config.max_length, config.emb_dim)

        def build_moe(emb_dim, dropout):
            # Adapter giving MoEFFN the same call signature as StandardFFN.
            return MoEFFN(emb_dim, config.num_experts, dropout=dropout)

        attention_classes = {
            'standard': StandardAttention,
            'rope': RoPEAttention,
            'flash': FlashAttentionModule,
            'multiquery': MultiQueryAttention,
            'alibi': ALiBiAttention,
        }
        ffn_factories = {
            'standard': StandardFFN,
            'moe': build_moe,
        }

        blocks = []
        for _ in range(config.n_layers):
            blocks.append(
                TransformerBlock(
                    config.emb_dim,
                    config.n_heads,
                    attention_classes[attn_variant],
                    ffn_factories[ffn_variant],
                    config.dropout,
                    config.norm_type,
                )
            )
        self.layers = nn.ModuleList(blocks)
        self.norm = get_norm(config.norm_type, config.emb_dim)
        self.output_proj = nn.Linear(config.emb_dim, config.vocab_size, bias=False)

    def forward(self, x):
        """Map token ids ``x`` of shape (batch, seq_len) to vocabulary logits."""
        positions = torch.arange(x.size(1), device=x.device)
        # The positional embedding is (seq_len, emb_dim) and broadcasts over the batch.
        hidden = self.token_embed(x) + self.pos_embed(positions)
        for block in self.layers:
            hidden = block(hidden)
        return self.output_proj(self.norm(hidden))

#############################################
# --- Ek: Model Özeti ve Parametre Sayısı Fonksiyonu ---
#############################################
def model_summary(model):
    """Print the total and the trainable parameter counts of ``model``."""
    total_params = 0
    trainable = 0
    # One pass over the parameters collects both counts.
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable += count
    print(f"Toplam Parametre: {total_params:,}")
    print(f"Eğitilebilir Parametre: {trainable:,}")

#############################################
# --- Ek: Greedy Decoding Fonksiyonu ---
#############################################
def greedy_decode(model, start_token, max_length, device):
    """Generate a token sequence by repeatedly taking the argmax next token.

    The whole sequence generated so far is fed to the model at every step.

    Args:
        model: Callable module mapping ids of shape (1, seq_len) to logits of
            shape (1, seq_len, vocab_size).
        start_token: Token id the sequence starts with.
        max_length: Total length of the returned sequence, including
            ``start_token``.
        device: Device on which the input tensors are created.

    Returns:
        List of ``max_length`` token ids (just ``[start_token]`` when
        ``max_length <= 1``).
    """
    # Remember the caller's mode. Previously the model was always switched to
    # training mode on exit, even when it had been in eval mode.
    was_training = model.training
    model.eval()
    generated = [start_token]
    try:
        with torch.no_grad():
            for _ in range(max_length - 1):
                input_seq = torch.tensor([generated], device=device)
                logits = model(input_seq)  # (batch, seq_len, vocab_size)
                generated.append(torch.argmax(logits[0, -1, :]).item())
    finally:
        model.train(was_training)
    return generated

#############################################
# --- Ek: Basit Eğitim Döngüsü (Training Loop) ---
#############################################
def train_model(model, config, epochs=3):
    """Run a toy training loop on random token data and print the loss.

    One "epoch" is a single AdamW step (lr 1e-4) on a fresh random batch of
    8 sequences of length ``config.max_length``; the targets are random as
    well, so this only exercises the forward/backward machinery.

    Args:
        model: Module mapping ids (batch, seq_len) to logits
            (batch, seq_len, vocab_size). It is moved to CUDA when available.
        config: Object providing ``vocab_size`` and ``max_length``.
        epochs: Number of optimisation steps to run.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()
    batch_shape = (8, config.max_length)

    for epoch_no in range(1, epochs + 1):
        model.train()
        # Dummy data: random token sequences for both inputs and targets.
        inputs = torch.randint(0, config.vocab_size, batch_shape, device=device)
        targets = torch.randint(0, config.vocab_size, batch_shape, device=device)

        optimizer.zero_grad()
        logits = model(inputs)  # (batch, seq_len, vocab_size)
        flat_logits = logits.view(-1, config.vocab_size)
        loss = criterion(flat_logits, targets.view(-1))
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch_no}/{epochs}, Loss: {loss.item():.4f}")

#############################################
# --- Detaylı Test Fonksiyonları (Önceki Versiyonun Geliştirilmiş Hali) ---
#############################################
def run_detailed_tests(config, variant_list):
    """Smoke-test and time every attention/FFN combination in ``variant_list``.

    For each variant the function builds a ``TransformerModel``, prints its
    parameter counts, runs one forward/backward pass on random tokens,
    measures the average wall-clock time of ten further forward passes and
    finally greedy-decodes ten tokens.

    Args:
        config: Model configuration (also supplies ``vocab_size`` and
            ``max_length`` for the random batches).
        variant_list: Iterable of dicts with the keys ``'attn'`` and ``'ffn'``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_cuda = device.type == 'cuda'
    batch_size = 4
    timing_runs = 10

    for variant in variant_list:
        attn_var = variant['attn']
        ffn_var = variant['ffn']
        print(f"\nTest: Attention = {attn_var}, FFN = {ffn_var}")
        model = TransformerModel(config, attn_variant=attn_var, ffn_variant=ffn_var).to(device)
        model_summary(model)

        # One forward/backward pass on random data to check shapes and gradients.
        model.train()
        dummy_input = torch.randint(0, config.vocab_size, (batch_size, config.max_length), device=device)
        dummy_target = torch.randint(0, config.vocab_size, (batch_size * config.max_length,), device=device)
        logits = model(dummy_input)
        loss = nn.CrossEntropyLoss()(logits.view(-1, config.vocab_size), dummy_target)
        loss.backward()
        print(f"Loss: {loss.item():.4f}, Output shape: {logits.shape}")

        # CUDA kernels run asynchronously, so synchronize before reading the
        # clock. perf_counter is monotonic and high-resolution, unlike time.time.
        if on_cuda:
            torch.cuda.synchronize()
        start_time = time.perf_counter()
        for _ in range(timing_runs):
            model(dummy_input)
        if on_cuda:
            torch.cuda.synchronize()
        avg_time = (time.perf_counter() - start_time) / timing_runs
        print(f"Ortalama ileri geçiş süresi: {avg_time:.6f} sn")

        # Greedy decoding test (generates a 10-token sequence).
        start_token = dummy_input[0, 0].item()
        generated = greedy_decode(model, start_token, max_length=10, device=device)
        print(f"Greedy Decode Çıktısı: {generated}")

#############################################
# --- Ana Çalışma Bölümü: Farklı varyantları deneyelim ---
#############################################
if __name__ == "__main__":
    # The configuration also carries the norm type and the dropout rate.
    config = Config(vocab_size=30522, emb_dim=768, max_length=128, n_layers=4, n_heads=12, dropout=0.1, norm_type='rmsnorm')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Variants to try: every attention type with the standard FFN first,
    # then every attention type with the MoE FFN.
    variant_list = [
        {'attn': attn_name, 'ffn': ffn_name}
        for ffn_name in ('standard', 'moe')
        for attn_name in ('standard', 'rope', 'flash', 'multiquery', 'alibi')
    ]

    print("=== Detaylı Varyant Testleri ===")
    run_detailed_tests(config, variant_list)

    print("\n=== Eğitim Döngüsü Testi ===")
    # Pick one variant for the training-loop test (the advanced one: RoPE + MoE FFN).
    model = TransformerModel(config, attn_variant='rope', ffn_variant='moe').to(device)
    train_model(model, config, epochs=3)

    print("\n=== Greedy Decoding Testi ===")
    # Greedy decoding example: start from a random token id and produce a 20-token sequence.
    prompt_token = torch.randint(0, config.vocab_size, (1,)).item()
    generated_tokens = greedy_decode(model, prompt_token, max_length=20, device=device)
    print("Üretilen Tokenlar:", generated_tokens)

In [None]:
#!pip install evaluate reportlab

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from datasets import load_dataset
from collections import defaultdict
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

#############################################
# Turkish-Alpaca Veri Seti ve Tokenizer
#############################################
class TurkishAlpacaDataset:
    """Whitespace-tokenized subset of the ``TFLai/Turkish-Alpaca`` dataset.

    The constructor downloads the dataset, keeps the first ``num_samples``
    instruction/output pairs, builds a word-level vocabulary (id 0 is
    ``<pad>``) and writes the resulting vocabulary size back to
    ``config.vocab_size``.

    Args:
        config: Configuration object; its ``vocab_size`` attribute is
            overwritten with the size of the vocabulary built here.
        num_samples: Number of leading training examples to keep.
    """

    def __init__(self, config, num_samples=100):
        # Load the dataset from the Hugging Face hub.
        train_split = load_dataset("TFLai/Turkish-Alpaca")['train']
        self.instructions = train_split['instruction'][:num_samples]
        self.outputs = train_split['output'][:num_samples]

        # A plain dict is used so that looking up an unknown word raises
        # KeyError instead of silently growing the vocabulary beyond
        # config.vocab_size (which would later index past the embedding table).
        self.vocab = {'<pad>': 0}

        # Tokenize all data (this also fills the vocabulary).
        self.tokenize_data()

        # Reverse mapping: token id -> word.
        self.inverse_vocab = {v: k for k, v in self.vocab.items()}

        # Publish the final vocabulary size through the shared config.
        config.vocab_size = len(self.vocab)
        self.config = config

    def _token_id(self, word):
        """Return the id of ``word``, assigning the next free id if it is new."""
        return self.vocab.setdefault(word, len(self.vocab))

    def tokenize_data(self):
        """Convert every instruction and output into a list of token ids.

        Ids are assigned in order of first appearance, walking through the
        pairs and tokenizing each instruction before its output.
        """
        self.tokenized_instructions = []
        self.tokenized_outputs = []

        for inst, out in zip(self.instructions, self.outputs):
            self.tokenized_instructions.append([self._token_id(word) for word in inst.split()])
            self.tokenized_outputs.append([self._token_id(word) for word in out.split()])

    def get_batch(self, batch_size=4):
        """Sample a random batch (with replacement) of padded id tensors.

        Inputs are the instruction tokens without their last token, targets
        are the output tokens without their first token. Both tensors are
        padded with 0 to ONE common length, so flattened logits and targets
        line up in the loss. Padding them separately gave different sequence
        lengths and a shape-mismatch error in the loss.

        NOTE(review): inputs and targets come from different texts, so they
        are aligned by position only — confirm this is the intended objective.

        Returns:
            ``(inputs, targets)``, two long tensors of identical shape
            ``(batch_size, max_len)``.
        """
        indices = torch.randint(0, len(self.tokenized_instructions), (batch_size,)).tolist()
        inputs = [torch.tensor(self.tokenized_instructions[i][:-1], dtype=torch.long) for i in indices]
        targets = [torch.tensor(self.tokenized_outputs[i][1:], dtype=torch.long) for i in indices]

        # Pad inputs and targets together so they share the same width.
        padded = torch.nn.utils.rnn.pad_sequence(inputs + targets, batch_first=True, padding_value=0)
        return padded[:batch_size], padded[batch_size:]


#############################################
# Greedy Decode Function
#############################################
def greedy_decode(model, start_token, max_length, device, temperature=1.0):
    """
    Greedy decoding to generate sequences from a language model.

    The model's previous train/eval mode is restored before returning.

    Args:
        model: The language model to use for generation.
        start_token: The token ID to start decoding from.
        max_length: Maximum length of the generated sequence.
        device: The device (CPU/GPU) where the model resides.
        temperature: Positive divisor applied to the logits (default=1.0).
            Decoding is pure argmax, so any positive value selects the same
            tokens; the parameter is kept for interface compatibility.

    Returns:
        List of generated token IDs, starting with ``start_token``.
        Generation stops early when token 0 (``<pad>``) is predicted; that
        token is not included.

    Raises:
        ValueError: If ``temperature`` is not positive. Zero would produce
            inf/NaN logits and a negative value would invert the argmax.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    was_training = model.training
    model.eval()
    generated_tokens = [start_token]
    try:
        with torch.no_grad():
            input_token = torch.tensor([[start_token]], dtype=torch.long, device=device)

            for _ in range(max_length - 1):
                logits = model(input_token)
                next_token_logits = logits[:, -1, :] / temperature
                next_token = torch.argmax(next_token_logits, dim=-1).item()

                if next_token == 0:  # Stop if <pad> token is generated
                    break

                generated_tokens.append(next_token)
                step = torch.tensor([[next_token]], dtype=torch.long, device=device)
                input_token = torch.cat([input_token, step], dim=1)
    finally:
        # Do not leave the caller's model stuck in eval mode.
        model.train(was_training)

    return generated_tokens


#############################################
# Geliştirilmiş Eğitim ve Değerlendirme
#############################################
def _masked_accuracy(flat_logits, flat_targets):
    """Return the share of non-padding targets (id != 0) predicted correctly by argmax."""
    keep = flat_targets != 0
    predictions = torch.argmax(flat_logits, dim=-1)
    hits = (predictions[keep] == flat_targets[keep]).sum().item()
    scored = keep.sum().item()
    if scored > 0:
        return hits / scored
    return 0


def train_and_evaluate(model, config, epochs=10):
    """Train ``model`` on random Turkish-Alpaca batches, then evaluate it once.

    A ``TurkishAlpacaDataset`` is built from ``config``. Each "epoch" is one
    AdamW step (lr 1e-4) on a random batch of 8 examples; loss, accuracy
    over non-padding targets and perplexity are printed per step. Afterwards
    one more random batch is scored in eval mode and a sample is generated
    with ``greedy_decode`` from the first word of the first instruction.

    Args:
        model: Module with a ``name`` attribute mapping ids (batch, seq) to
            logits (batch, seq, vocab_size).
        config: Configuration providing ``vocab_size`` and ``max_length``.
        epochs: Number of training steps.

    Returns:
        Dict with the keys ``parameters``, ``trainable_parameters``,
        ``loss``, ``accuracy``, ``perplexity`` and ``sample_outputs``.
    """
    device = next(model.parameters()).device
    dataset = TurkishAlpacaDataset(config)
    optimizer = optim.AdamW(model.parameters(), lr=1e-4)
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)

    def score_batch():
        """Draw a batch of 8 and return (flat logits, flat targets, loss)."""
        batch_inputs, batch_targets = dataset.get_batch(batch_size=8)
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)
        flat_logits = model(batch_inputs).view(-1, config.vocab_size)
        flat_targets = batch_targets.view(-1)
        return flat_logits, flat_targets, loss_fn(flat_logits, flat_targets)

    print(f"\n{'='*40}")
    print(f"🏁 {model.name} Eğitime Başlıyor...")
    print(f"🔢 Toplam Token Sayısı: {len(dataset.vocab)}")
    print(f"⚙️  Kullanılan Donanım: {'GPU' if device.type=='cuda' else 'CPU'}")
    print(f"{'='*40}\n")

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        logits, targets, loss = score_batch()
        loss.backward()
        optimizer.step()

        # Training metrics for this step.
        acc = _masked_accuracy(logits, targets)
        ppl = math.exp(loss.item())

        print(f"Epok {epoch+1}/{epochs} | "
              f"Kayıp: {loss.item():.3f} | "
              f"Doğruluk: {acc:.1%} | "
              f"Perplexity: {ppl:.2f}")

    # Final evaluation on one more random batch.
    model.eval()
    with torch.no_grad():
        logits, targets, loss = score_batch()
        final_acc = _masked_accuracy(logits, targets)
        final_ppl = math.exp(loss.item())

        # Sample generation seeded with the first word of the first instruction.
        start_word = dataset.instructions[0].split()[0]
        input_token = dataset.vocab[start_word]
        generated = greedy_decode(model, input_token, max_length=config.max_length, device=device)
        generated_sentence = ' '.join([dataset.inverse_vocab.get(t, "?") for t in generated])

    print(f"\n⭐ Final Performans ⭐")
    print(f"|{'Metric':<15}|{'Değer':<15}|")
    print(f"|{'-'*15}|{'-'*15}|")
    print(f"|{'Kayıp':<15}|{loss.item():.3f}|")
    print(f"|{'Doğruluk':<15}|{final_acc:.1%}|")
    print(f"|{'Perplexity':<15}|{final_ppl:.2f}|")
    print(f"\n🔮 Örnek Çıktı: {generated_sentence}")

    # Metrics handed back to the caller.
    parameter_counts = [(p.numel(), p.requires_grad) for p in model.parameters()]
    return {
        'parameters': sum(count for count, _ in parameter_counts),
        'trainable_parameters': sum(count for count, needs_grad in parameter_counts if needs_grad),
        'loss': loss.item(),
        'accuracy': final_acc,
        'perplexity': final_ppl,
        'sample_outputs': [generated_sentence]
    }


#############################################
# PDF Oluşturma Fonksiyonu (reportlab ile)
#############################################
def save_results_to_pdf(metrics, model_name):
    """Write the evaluation metrics and sample outputs to a one-column PDF report.

    The file is created in the current working directory as
    ``"{model_name}_degerlendirme.pdf"``.

    ``drawString`` neither wraps nor paginates, so a long generated sample
    used to run off the right edge of the page. Sample outputs are now
    wrapped to the printable width and a new page is started whenever the
    cursor reaches the bottom margin.

    Args:
        metrics: Dict with the keys ``parameters``, ``trainable_parameters``,
            ``loss``, ``accuracy``, ``perplexity`` and ``sample_outputs``
            (an iterable of strings).
        model_name: Name used in the title and in the file name.
    """
    # Local import: helper from the already-used reportlab package that
    # splits text into lines fitting a given width for a font and size.
    from reportlab.lib.utils import simpleSplit

    pdf_path = f"{model_name}_degerlendirme.pdf"
    c = canvas.Canvas(pdf_path, pagesize=A4)
    width, height = A4

    left_margin = 50
    bottom_margin = 50
    line_height = 20
    body_font, body_size = "Helvetica", 12
    printable_width = width - 2 * left_margin

    def draw_line(text, y):
        """Draw ``text`` at height ``y`` (on a new page if needed); return the next y."""
        if y < bottom_margin:
            c.showPage()
            # The font selection does not carry over to a new page.
            c.setFont(body_font, body_size)
            y = height - 50
        c.drawString(left_margin, y, text)
        return y - line_height

    # Title
    c.setFont("Helvetica-Bold", 16)
    c.drawString(left_margin, height - 50, f"Model Değerlendirme Raporu: {model_name}")

    # Metrics
    c.setFont(body_font, body_size)
    y = height - 80
    y = draw_line("📊 Performans Metrikleri", y)
    y = draw_line(f"Toplam Parametre Sayısı: {metrics['parameters']:,}", y)
    y = draw_line(f"Eğitilebilir Parametre Sayısı: {metrics['trainable_parameters']:,}", y)
    y = draw_line(f"Kayıp: {metrics['loss']:.3f}", y)
    y = draw_line(f"Doğruluk: {metrics['accuracy']:.1%}", y)
    y = draw_line(f"Perplexity: {metrics['perplexity']:.2f}", y)

    # Sample outputs (extra 10pt gap before the section header)
    y -= 10
    y = draw_line("🔮 Örnek Çıktılar", y)
    for i, output in enumerate(metrics['sample_outputs']):
        entry = f"Örnek {i+1}: {output}"
        for wrapped_line in simpleSplit(entry, body_font, body_size, printable_width):
            y = draw_line(wrapped_line, y)

    # Save the PDF
    c.save()
    print(f"📄 {model_name} için rapor PDF olarak kaydedildi: {pdf_path}")


#############################################
# Dummy Transformer Model for Testing
#############################################
class TransformerModel(nn.Module):
    """Stand-in language model built on ``nn.Transformer`` for the evaluation cell.

    ``attn_type`` and ``ffn_type`` are only recorded in ``self.name``; the
    architecture is the same encoder-decoder transformer whatever their
    values. The embedded input is used as both source and target, and no
    attention mask is supplied.

    Args:
        config: Object providing ``vocab_size``, ``emb_dim``, ``n_heads``,
            ``n_layers`` and ``dropout``.
        attn_type: Label of the attention variant (used for the name only).
        ffn_type: Label of the feed-forward variant (used for the name only).
    """

    def __init__(self, config, attn_type, ffn_type):
        super().__init__()
        self.name = f"{attn_type}-{ffn_type}"
        self.embedding = nn.Embedding(config.vocab_size, config.emb_dim)
        self.transformer = nn.Transformer(
            d_model=config.emb_dim,
            nhead=config.n_heads,
            num_encoder_layers=config.n_layers,
            num_decoder_layers=config.n_layers,
            dim_feedforward=config.emb_dim * 4,
            dropout=config.dropout,
            # Inputs arrive as (batch, seq, emb). With the default
            # batch_first=False the batch axis would be treated as the
            # sequence, so attention would mix different examples and a
            # single sequence would get no context at all.
            batch_first=True,
        )
        self.fc_out = nn.Linear(config.emb_dim, config.vocab_size)

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to logits (batch, seq_len, vocab_size)."""
        x = self.embedding(x)
        x = self.transformer(x, x)
        x = self.fc_out(x)
        return x


if __name__ == "__main__":
    # Configuration
    class Config:
        """Settings for the evaluation run; ``max_length`` is derived from the data."""

        def __init__(self):
            self.vocab_size = 100  # Placeholder; replaced once the vocabulary is built
            self.emb_dim = 256     # Embedding width

            # Derive the maximum length from the dataset (first 100 samples).
            dataset = load_dataset("TFLai/Turkish-Alpaca")
            instructions = dataset['train']['instruction'][:100]
            outputs = dataset['train']['output'][:100]

            max_instruction_length = max(len(inst.split()) for inst in instructions)
            max_output_length = max(len(out.split()) for out in outputs)

            # Longest instruction/output in words, plus some headroom.
            self.max_length = max(max_instruction_length, max_output_length) + 10

            self.n_layers = 4      # Number of layers
            self.n_heads = 8       # Number of attention heads
            self.dropout = 0.1
            self.norm_type = 'rmsnorm'
            self.num_experts = 2

    config = Config()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the vocabulary once BEFORE any model is created. The dataset
    # constructor writes the real vocabulary size into config.vocab_size.
    # Without this the first model got embedding/output layers sized for the
    # placeholder of 100 tokens and failed as soon as a larger token id
    # was fed in.
    TurkishAlpacaDataset(config)

    # Models to test
    experiments = [
        {'attn': 'standard', 'ffn': 'standard', 'name': 'Standart Model'},
        {'attn': 'rope', 'ffn': 'standard', 'name': 'RoPE Dikkat'},
        {'attn': 'alibi', 'ffn': 'moe', 'name': 'ALiBi + MoE'},
        {'attn': 'multiquery', 'ffn': 'moe', 'name': 'Multi-Query MoE'}
    ]

    # Run the experiments
    results = []
    for exp in experiments:
        print(f"\n{'='*40}")
        print(f"🧪 {exp['name']} Değerlendiriliyor...")
        print(f"{'='*40}")

        model = TransformerModel(config, exp['attn'], exp['ffn']).to(device)
        model.name = exp['name']

        # Training and evaluation
        metrics = train_and_evaluate(model, config, epochs=20)
        results.append((exp['name'], metrics))

        # Write the PDF report
        save_results_to_pdf(metrics, exp['name'])

    # Compare all results
    print("\n📊 Tüm Modellerin Karşılaştırması:")
    print(f"|{'Model':<20}|{'Parametre':<10}|{'Doğruluk':<10}|{'Perplexity':<12}|")
    print(f"|{'-'*20}|{'-'*10}|{'-'*10}|{'-'*12}|")
    for name, metrics in results:
        print(f"|{name:<20}|{metrics['parameters']:<10,}|{metrics['accuracy']:<10.1%}|{metrics['perplexity']:<12.2f}|")

İşlem için CPU ve GPU yetersiz, o yüzden bu uyarı geliyor.