In [None]:
# Print GPU status for this runtime via the nvidia-smi command-line tool.
!nvidia-smi

In [None]:
import os
import time
import tracemalloc

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from datasets import load_dataset
from huggingface_hub import login
from tqdm import tqdm
from transformers import AutoTokenizer

# Read the Hugging Face token from the environment instead of embedding it in
# the notebook: a token saved with the file is exposed to every reader.
# Create one at https://huggingface.co/settings/tokens and export it as HF_TOKEN
# before starting the kernel.
# NOTE(review): the token that used to be hard-coded here should be treated as
# leaked and revoked.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN")

# Login to Hugging Face (best effort: a failure is reported, not raised)
if HUGGINGFACE_TOKEN:
    try:
        login(token=HUGGINGFACE_TOKEN)
        print("Successfully logged in to Hugging Face")
    except Exception as e:
        print(f"Failed to login to Hugging Face: {str(e)}")
else:
    print("HF_TOKEN is not set; skipping Hugging Face login")


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from datasets import load_dataset
import time
import tracemalloc
import matplotlib.pyplot as plt
from tqdm import tqdm

# Multi-head Attention (MHA) sınıfı
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Fail fast: with a remainder, num_heads * d_k != d_model and the
        # .view() calls in forward() would fail with an opaque shape error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # feature width of a single head

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size, seq_len, _ = x.size()
        # Project, then split the feature axis into heads:
        # (batch, seq, d_model) -> (batch, heads, seq, d_k).
        Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention; no mask is applied.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.W_o(context)

# Multi-head Latent Attention (MLA) sınıfı
class MultiHeadLatentAttention(nn.Module):
    """Multi-head self-attention whose keys and values come from a shared latent.

    The input is projected down to ``latent_dim`` once (``W_kv_down``); keys and
    values are then produced from that latent by ``W_k_up`` and ``W_v_up``.
    Queries are projected directly from the input.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.
        latent_dim: Width of the shared key/value latent.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads, latent_dim):
        super().__init__()
        # Fail fast: with a remainder, num_heads * d_k != d_model and the
        # .view() calls in forward() would fail with an opaque shape error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # feature width of a single head
        self.latent_dim = latent_dim

        self.W_q = nn.Linear(d_model, d_model)
        self.W_kv_down = nn.Linear(d_model, latent_dim)
        self.W_k_up = nn.Linear(latent_dim, d_model)
        self.W_v_up = nn.Linear(latent_dim, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply latent self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size, seq_len, _ = x.size()
        Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        # Keys and values share one down-projection: (batch, seq, latent_dim).
        kv_latent = self.W_kv_down(x)
        K = self.W_k_up(kv_latent).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v_up(kv_latent).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention; no mask is applied.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.W_o(context)

# Veri setini yükleme ve ön işlem (Düzeltildi)
def prepare_data(batch_size=32, seq_len=128, d_model=512):
    """Build a batched tensor of randomly embedded IMDB reviews for benchmarking.

    Loads the first 1000 IMDB training reviews, tokenizes them with the
    ``bert-base-uncased`` tokenizer (padded/truncated to ``seq_len``), maps the
    token ids through a freshly initialised ``nn.Embedding`` and groups the
    result into full batches. Samples that do not fill a batch are dropped.

    Args:
        batch_size: Number of sequences per batch.
        seq_len: Token length every review is padded or truncated to.
        d_model: Width of the embedding vectors.

    Returns:
        Tensor of shape ``(num_batches, batch_size, seq_len, d_model)`` that is
        detached from autograd.
    """
    # Load the first 1000 examples of the IMDB training split.
    dataset = load_dataset("imdb", split="train[:1000]")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Tokenize the texts and return PyTorch tensors. The column is materialised
    # as a plain list of strings, which is the input type the tokenizer accepts.
    inputs = tokenizer(
        list(dataset["text"]),
        max_length=seq_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Map token ids to d_model-wide vectors with an untrained embedding layer.
    # The layer is discarded afterwards, so run it without autograd: otherwise
    # the returned tensor would keep a graph back to the embedding weights.
    embedding = nn.Embedding(tokenizer.vocab_size, d_model)
    input_ids = inputs["input_ids"]
    with torch.no_grad():
        data = embedding(input_ids)

    # Keep only whole batches so the view below divides evenly.
    num_samples = data.size(0)
    num_batches = num_samples // batch_size
    data = data[:num_batches * batch_size]  # drop the leftover samples
    data = data.view(num_batches, batch_size, seq_len, d_model)

    print(f"Veri boyutu: {data.shape} (num_batches, batch_size, seq_len, d_model)")
    return data

# Performans testi
def _profile_forward(model, batch):
    """Run one forward pass and return ``(seconds, peak_mb, output)``."""
    tracemalloc.start()
    try:
        # perf_counter is a monotonic, high-resolution clock meant for intervals.
        start_time = time.perf_counter()
        # Inference only: skip building an autograd graph for every batch.
        with torch.no_grad():
            output = model(batch)
        elapsed = time.perf_counter() - start_time
        _, peak_memory = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, even if the forward pass raises.
        tracemalloc.stop()
    return elapsed, peak_memory / 1024 / 1024, output


def run_performance_test(data, d_model=512, num_heads=8, latent_dims=(128, 64)):
    """Time forward passes of MHA and of MLA at several latent widths.

    Args:
        data: Iterable of batches, each of shape ``(batch_size, seq_len, d_model)``.
        d_model: Feature width the attention modules are built with.
        num_heads: Number of attention heads.
        latent_dims: Latent widths; one MLA module is built per value.

    Returns:
        Dict keyed by ``"MHA"`` and ``"MLA_<dim>"``. Each value holds ``"time"``
        (seconds per batch), ``"memory"`` (peak MB per batch as reported by
        tracemalloc) and ``"output"`` (the output of the last batch).

    NOTE(review): tracemalloc reports memory allocated through Python's
    allocator; tensor storage allocated natively by PyTorch is probably not
    included, so read "memory" as Python-side overhead only - confirm.
    """
    models = {"MHA": MultiHeadAttention(d_model, num_heads)}
    for dim in latent_dims:
        models[f"MLA_{dim}"] = MultiHeadLatentAttention(d_model, num_heads, dim)

    results = {label: {"time": [], "memory": [], "output": None} for label in models}

    for batch in tqdm(data, desc="Batch'ler üzerinde test"):
        for label, model in models.items():
            elapsed, peak_mb, output = _profile_forward(model, batch)
            results[label]["time"].append(elapsed)
            results[label]["memory"].append(peak_mb)
            results[label]["output"] = output

    return results

# Sonuçları görselleştirme
def visualize_results(results):
    """Plot mean time and memory per model and print output gaps versus MHA.

    Args:
        results: Dict mapping a model label to a dict with ``"time"`` and
            ``"memory"`` lists and an ``"output"`` tensor. The first key is
            treated as the reference and must be ``"MHA"``.
    """
    def _mean(values):
        return sum(values) / len(values)

    labels = list(results.keys())
    avg_times = [_mean(results[label]["time"]) for label in labels]
    avg_memories = [_mean(results[label]["memory"]) for label in labels]

    # One bar chart per metric, side by side.
    panels = (
        (1, avg_times, "Ortalama Çalışma Süresi (saniye)", "Süre (s)"),
        (2, avg_memories, "Ortalama Bellek Kullanımı (MB)", "Bellek (MB)"),
    )
    plt.figure(figsize=(12, 5))
    for position, heights, title, ylabel in panels:
        plt.subplot(1, 2, position)
        plt.bar(labels, heights, color=["blue", "orange", "green"])
        plt.title(title)
        plt.ylabel(ylabel)

    plt.tight_layout()
    plt.show()

    # Mean absolute difference between each remaining model's output and MHA's.
    mha_out = results["MHA"]["output"]
    for label in labels[1:]:
        diff = (mha_out - results[label]["output"]).abs().mean().item()
        print(f"{label} ile MHA arasındaki ortalama çıktı farkı: {diff:.6f}")

# Ana fonksiyon
def main():
    """Run the forward-pass benchmark: prepare data, time models, report, plot."""
    def _mean(values):
        return sum(values) / len(values)

    print("Veri seti hazırlanıyor...")
    data = prepare_data(batch_size=32, seq_len=128, d_model=512)

    print("Performans testi yapılıyor...")
    results = run_performance_test(data, d_model=512, num_heads=8, latent_dims=[128, 64])

    # Text summary of the per-model averages.
    print("\n=== Performans Sonuçları ===")
    for label, metrics in results.items():
        avg_time = _mean(metrics["time"])
        avg_memory = _mean(metrics["memory"])
        print(f"{label}:")
        print(f" - Ortalama Çalışma Süresi: {avg_time:.4f} saniye")
        print(f" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB")

    print("\nGörselleştirme yapılıyor...")
    visualize_results(results)


# Run the benchmark when this cell is executed.
if __name__ == "__main__":
    main()

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from datasets import load_dataset
import time
import tracemalloc
import matplotlib.pyplot as plt
from tqdm import tqdm

# Multi-head Attention (MHA)
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Fail fast: with a remainder, num_heads * d_k != d_model and the
        # .view() calls in forward() would fail with an opaque shape error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # feature width of a single head

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size, seq_len, _ = x.size()
        # Project, then split the feature axis into heads:
        # (batch, seq, d_model) -> (batch, heads, seq, d_k).
        Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention; no mask is applied.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.W_o(context)

# Multi-head Latent Attention (MLA)
class MultiHeadLatentAttention(nn.Module):
    """Multi-head self-attention whose keys and values come from a shared latent.

    The input is projected down to ``latent_dim`` once (``W_kv_down``); keys and
    values are then produced from that latent by ``W_k_up`` and ``W_v_up``.
    Queries are projected directly from the input.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.
        latent_dim: Width of the shared key/value latent.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads, latent_dim):
        super().__init__()
        # Fail fast: with a remainder, num_heads * d_k != d_model and the
        # .view() calls in forward() would fail with an opaque shape error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # feature width of a single head
        self.latent_dim = latent_dim

        self.W_q = nn.Linear(d_model, d_model)
        self.W_kv_down = nn.Linear(d_model, latent_dim)
        self.W_k_up = nn.Linear(latent_dim, d_model)
        self.W_v_up = nn.Linear(latent_dim, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply latent self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size, seq_len, _ = x.size()
        Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        # Keys and values share one down-projection: (batch, seq, latent_dim).
        kv_latent = self.W_kv_down(x)
        K = self.W_k_up(kv_latent).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v_up(kv_latent).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention; no mask is applied.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.W_o(context)

# Sınıflandırma modeli
class SentimentClassifier(nn.Module):
    """Embedding -> self-attention -> mean pooling over tokens -> 2-way linear head.

    Args:
        d_model: Embedding and attention feature width.
        num_heads: Number of attention heads.
        attention_type: ``"MHA"`` for ``MultiHeadAttention`` or ``"MLA"`` for
            ``MultiHeadLatentAttention``.
        latent_dim: Latent width for MLA; required when ``attention_type`` is
            ``"MLA"`` and ignored otherwise.
        vocab_size: Number of rows in the embedding table. Defaults to 30522,
            the value that was previously hard-coded (presumably the
            bert-base-uncased vocabulary size - confirm).

    Raises:
        ValueError: If ``attention_type`` is unknown, or is ``"MLA"`` without a
            ``latent_dim``.
    """

    def __init__(self, d_model, num_heads, attention_type="MHA", latent_dim=None, vocab_size=30522):
        super().__init__()
        # Validate up front: an unknown type used to leave self.attention unset
        # and only failed later, inside forward(), with an AttributeError.
        if attention_type not in ("MHA", "MLA"):
            raise ValueError(
                f"attention_type must be 'MHA' or 'MLA', got {attention_type!r}"
            )
        if attention_type == "MLA" and latent_dim is None:
            raise ValueError("latent_dim is required when attention_type is 'MLA'")

        self.embedding = nn.Embedding(vocab_size, d_model)
        if attention_type == "MHA":
            self.attention = MultiHeadAttention(d_model, num_heads)
        else:
            self.attention = MultiHeadLatentAttention(d_model, num_heads, latent_dim)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(d_model, 2)

    def forward(self, x):
        """Classify token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch_size, seq_len)``.

        Returns:
            Logits of shape ``(batch_size, 2)``.
        """
        x = self.embedding(x)
        x = self.attention(x)
        # AdaptiveAvgPool1d pools the last axis, so move seq_len there:
        # (batch, seq, d_model) -> (batch, d_model, seq) -> (batch, d_model).
        x = x.transpose(1, 2)
        x = self.pool(x).squeeze(-1)
        return self.fc(x)

# Parametre sayısını hesaplama
def count_parameters(model):
    """Return ``(total, trainable)`` parameter element counts for ``model``.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    return total_params, trainable_params

# Veri setini hazırlama
def prepare_data(batch_size=32, seq_len=128):
    """Load 1000 IMDB reviews and split them into batched train/test token ids.

    Reviews are tokenized with the ``bert-base-uncased`` tokenizer and padded or
    truncated to ``seq_len``. The first 800 samples form the training set and
    the rest the test set; samples that do not fill a batch are dropped.

    Args:
        batch_size: Number of sequences per batch.
        seq_len: Token length every review is padded or truncated to.

    Returns:
        ``((train_data, train_labels), (test_data, test_labels))`` where data
        tensors have shape ``(num_batches, batch_size, seq_len)`` and label
        tensors ``(num_batches, batch_size)``.
    """
    def _batched(samples, targets):
        # Keep only whole batches so the views divide evenly.
        num_batches = samples.size(0) // batch_size
        kept = num_batches * batch_size
        return (
            samples[:kept].view(num_batches, batch_size, seq_len),
            targets[:kept].view(num_batches, batch_size),
        )

    # Draw a seeded random sample instead of the head of the split.
    # NOTE(review): the Hub copy of IMDB appears to be stored sorted by label,
    # in which case "train[:1000]" yields a single class and every accuracy
    # figure is trivially perfect - confirm against the dataset viewer.
    dataset = load_dataset("imdb", split="train").shuffle(seed=42).select(range(1000))
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Columns are materialised as plain lists before being handed on.
    inputs = tokenizer(
        list(dataset["text"]),
        max_length=seq_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = inputs["input_ids"]
    labels = torch.tensor(list(dataset["label"]))

    train_size = 800
    train_data, train_labels = _batched(input_ids[:train_size], labels[:train_size])
    test_data, test_labels = _batched(input_ids[train_size:], labels[train_size:])

    print(f"Eğitim veri boyutu: {train_data.shape}")
    print(f"Test veri boyutu: {test_data.shape}")
    return (train_data, train_labels), (test_data, test_labels)

# Modeli eğitme (Detaylı çıktı)
def train_model(model, train_data, train_labels, epochs=5, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, recording per-epoch metrics.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_data: Iterable of input batches.
        train_labels: Iterable of label batches, aligned with ``train_data``.
        epochs: Number of passes over the training batches.
        lr: Adam learning rate.

    Returns:
        Dict with one entry per epoch in each of ``"loss"`` (mean batch loss),
        ``"accuracy"``, ``"time"`` (mean seconds per batch) and ``"memory"``
        (mean peak MB per batch as reported by tracemalloc).

    NOTE(review): tracemalloc reports memory allocated through Python's
    allocator; tensor storage allocated natively by PyTorch is probably not
    included, so read "memory" as Python-side overhead only - confirm.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    results = {"loss": [], "accuracy": [], "time": [], "memory": []}
    for epoch in tqdm(range(epochs), desc="Epochs"):
        # Make sure training-mode layers are active even if the model was
        # previously switched to eval mode.
        model.train()
        epoch_loss = 0
        epoch_correct = 0
        epoch_time = 0
        epoch_memory = 0
        total_samples = 0

        for batch_data, batch_labels in zip(train_data, train_labels):
            tracemalloc.start()
            try:
                # perf_counter is a monotonic clock meant for measuring intervals.
                start_time = time.perf_counter()

                optimizer.zero_grad()
                outputs = model(batch_data)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()

                elapsed = time.perf_counter() - start_time
                _, peak_memory = tracemalloc.get_traced_memory()
            finally:
                # Always stop tracing, even if the training step raises.
                tracemalloc.stop()

            # Per-batch accuracy
            _, predicted = torch.max(outputs, 1)
            correct = (predicted == batch_labels).sum().item()
            batch_size = batch_labels.size(0)

            epoch_loss += loss.item()
            epoch_correct += correct
            epoch_time += elapsed
            epoch_memory += peak_memory / 1024 / 1024
            total_samples += batch_size

        avg_loss = epoch_loss / len(train_data)
        accuracy = epoch_correct / total_samples
        avg_time = epoch_time / len(train_data)
        avg_memory = epoch_memory / len(train_data)

        results["loss"].append(avg_loss)
        results["accuracy"].append(accuracy)
        results["time"].append(avg_time)
        results["memory"].append(avg_memory)

        print(f"\nEpoch {epoch+1}/{epochs}:")
        print(f" - Ortalama Kayıp: {avg_loss:.4f}")
        print(f" - Doğruluk: {accuracy:.4f}")
        print(f" - Ortalama Batch Süresi: {avg_time:.4f} saniye")
        print(f" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB")

    return results

# Test etme (Detaylı çıktı)
def test_model(model, test_data, test_labels):
    correct = 0
    total = 0
    test_loss = 0
    criterion = nn.CrossEntropyLoss()
    
    with torch.no_grad():
        for batch_data, batch_labels in zip(test_data, test_labels):
            outputs = model(batch_data)
            loss = criterion(outputs, batch_labels)
            _, predicted = torch.max(outputs, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
            test_loss += loss.item()
    
    accuracy = correct / total
    avg_loss = test_loss / len(test_data)
    print(f"\nTest Sonuçları:")
    print(f" - Ortalama Kayıp: {avg_loss:.4f}")
    print(f" - Doğruluk: {accuracy:.4f}")
    print(f" - Toplam Örnek Sayısı: {total}")
    return accuracy, avg_loss

# Ana fonksiyon
def main():
    """Train and test one MHA and one MLA classifier, then plot their curves."""
    batch_size, seq_len, d_model, num_heads = 32, 128, 512, 8
    latent_dim = 128
    # Single source of truth for the epoch count: it is passed to train_model
    # and sizes the plot x-axis, so the two cannot drift apart.
    epochs = 5

    print("Veri seti hazırlanıyor...")
    (train_data, train_labels), (test_data, test_labels) = prepare_data(batch_size, seq_len)

    # Build the models
    print("\n=== MHA Modeli ===")
    mha_model = SentimentClassifier(d_model, num_heads, "MHA")
    mha_total_params, mha_trainable_params = count_parameters(mha_model)
    print(f"Toplam Parametre Sayısı: {mha_total_params:,}")
    print(f"Eğitilebilir Parametre Sayısı: {mha_trainable_params:,}")

    print("\n=== MLA Modeli ===")
    mla_model = SentimentClassifier(d_model, num_heads, "MLA", latent_dim)
    mla_total_params, mla_trainable_params = count_parameters(mla_model)
    print(f"Toplam Parametre Sayısı: {mla_total_params:,}")
    print(f"Eğitilebilir Parametre Sayısı: {mla_trainable_params:,}")

    # test_model prints its own summary, so its return value is not kept here.
    print("\nMHA modeli eğitiliyor...")
    mha_results = train_model(mha_model, train_data, train_labels, epochs=epochs)
    test_model(mha_model, test_data, test_labels)

    print("\nMLA modeli eğitiliyor...")
    mla_results = train_model(mla_model, train_data, train_labels, epochs=epochs)
    test_model(mla_model, test_data, test_labels)

    # Plot the results: one panel per recorded metric, MHA against MLA.
    epoch_axis = range(1, epochs + 1)
    panels = [
        ("loss", "Eğitim Kaybı", "Loss"),
        ("accuracy", "Eğitim Doğruluğu", "Accuracy"),
        ("time", "Ortalama Batch Süresi (s)", "Süre (s)"),
        ("memory", "Ortalama Bellek Kullanımı (MB)", "Bellek (MB)"),
    ]
    plt.figure(figsize=(15, 10))
    for position, (key, title, ylabel) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.plot(epoch_axis, mha_results[key], label="MHA")
        plt.plot(epoch_axis, mla_results[key], label="MLA")
        plt.title(title)
        plt.xlabel("Epoch")
        plt.ylabel(ylabel)
        plt.legend()

    plt.tight_layout()
    plt.show()


# Run the experiment when this cell is executed.
if __name__ == "__main__":
    main()

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from datasets import load_dataset
import time
import tracemalloc
import matplotlib.pyplot as plt
from tqdm import tqdm

# Multi-head Attention (MHA)
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Fail fast: with a remainder, num_heads * d_k != d_model and the
        # .view() calls in forward() would fail with an opaque shape error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # feature width of a single head

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size, seq_len, _ = x.size()
        # Project, then split the feature axis into heads:
        # (batch, seq, d_model) -> (batch, heads, seq, d_k).
        Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention; no mask is applied.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.W_o(context)

# Multi-head Latent Attention (MLA)
class MultiHeadLatentAttention(nn.Module):
    """Multi-head self-attention whose keys and values come from a shared latent.

    The input is projected down to ``latent_dim`` once (``W_kv_down``); keys and
    values are then produced from that latent by ``W_k_up`` and ``W_v_up``.
    Queries are projected directly from the input.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.
        latent_dim: Width of the shared key/value latent.

    Raises:
        ValueError: If ``d_model`` is not a multiple of ``num_heads``.
    """

    def __init__(self, d_model, num_heads, latent_dim):
        super().__init__()
        # Fail fast: with a remainder, num_heads * d_k != d_model and the
        # .view() calls in forward() would fail with an opaque shape error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # feature width of a single head
        self.latent_dim = latent_dim

        self.W_q = nn.Linear(d_model, d_model)
        self.W_kv_down = nn.Linear(d_model, latent_dim)
        self.W_k_up = nn.Linear(latent_dim, d_model)
        self.W_v_up = nn.Linear(latent_dim, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply latent self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_model)``.
        """
        batch_size, seq_len, _ = x.size()
        Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        # Keys and values share one down-projection: (batch, seq, latent_dim).
        kv_latent = self.W_kv_down(x)
        K = self.W_k_up(kv_latent).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v_up(kv_latent).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention; no mask is applied.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)
        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.W_o(context)

# Sınıflandırma modeli
class SentimentClassifier(nn.Module):
    """Embedding -> self-attention -> mean pooling over tokens -> 2-way linear head.

    Args:
        d_model: Embedding and attention feature width.
        num_heads: Number of attention heads.
        attention_type: ``"MHA"`` for ``MultiHeadAttention`` or ``"MLA"`` for
            ``MultiHeadLatentAttention``.
        latent_dim: Latent width for MLA; required when ``attention_type`` is
            ``"MLA"`` and ignored otherwise.
        vocab_size: Number of rows in the embedding table. Defaults to 30522,
            the value that was previously hard-coded (presumably the
            bert-base-uncased vocabulary size - confirm).

    Raises:
        ValueError: If ``attention_type`` is unknown, or is ``"MLA"`` without a
            ``latent_dim``.
    """

    def __init__(self, d_model, num_heads, attention_type="MHA", latent_dim=None, vocab_size=30522):
        super().__init__()
        # Validate up front: an unknown type used to leave self.attention unset
        # and only failed later, inside forward(), with an AttributeError.
        if attention_type not in ("MHA", "MLA"):
            raise ValueError(
                f"attention_type must be 'MHA' or 'MLA', got {attention_type!r}"
            )
        if attention_type == "MLA" and latent_dim is None:
            raise ValueError("latent_dim is required when attention_type is 'MLA'")

        self.embedding = nn.Embedding(vocab_size, d_model)
        if attention_type == "MHA":
            self.attention = MultiHeadAttention(d_model, num_heads)
        else:
            self.attention = MultiHeadLatentAttention(d_model, num_heads, latent_dim)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(d_model, 2)

    def forward(self, x):
        """Classify token-id sequences.

        Args:
            x: Integer tensor of token ids, shape ``(batch_size, seq_len)``.

        Returns:
            Logits of shape ``(batch_size, 2)``.
        """
        x = self.embedding(x)
        x = self.attention(x)
        # AdaptiveAvgPool1d pools the last axis, so move seq_len there:
        # (batch, seq, d_model) -> (batch, d_model, seq) -> (batch, d_model).
        x = x.transpose(1, 2)
        x = self.pool(x).squeeze(-1)
        return self.fc(x)

# Parametre sayısını hesaplama
def count_parameters(model):
    """Count the parameter elements of ``model``.

    Returns:
        ``(total_params, trainable_params)``, where the second number only
        includes parameters whose ``requires_grad`` flag is set.
    """
    # Gather (size, trainable?) once, then reduce twice over the small list.
    sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
    total_params = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, trainable in sizes if trainable)
    return total_params, trainable_params

# Veri setini hazırlama (Validasyon seti eklendi)
def prepare_data(batch_size=32, seq_len=128):
    """Load 1000 IMDB reviews and split them into batched train/val/test sets.

    Reviews are tokenized with the ``bert-base-uncased`` tokenizer and padded or
    truncated to ``seq_len``. The split is 700 / 100 / 200 samples; samples
    that do not fill a batch are dropped from each split.

    Args:
        batch_size: Number of sequences per batch.
        seq_len: Token length every review is padded or truncated to.

    Returns:
        ``((train_data, train_labels), (val_data, val_labels),
        (test_data, test_labels))`` where data tensors have shape
        ``(num_batches, batch_size, seq_len)`` and label tensors
        ``(num_batches, batch_size)``.
    """
    def _batched(samples, targets, split_size):
        # Keep only whole batches so the views divide evenly.
        num_batches = split_size // batch_size
        kept = num_batches * batch_size
        return (
            samples[:kept].view(num_batches, batch_size, seq_len),
            targets[:kept].view(num_batches, batch_size),
        )

    # Draw a seeded random sample instead of the head of the split.
    # NOTE(review): the Hub copy of IMDB appears to be stored sorted by label,
    # in which case "train[:1000]" yields a single class and every accuracy
    # figure is trivially perfect - confirm against the dataset viewer.
    dataset = load_dataset("imdb", split="train").shuffle(seed=42).select(range(1000))
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Columns are materialised as plain lists before being handed on.
    inputs = tokenizer(
        list(dataset["text"]),
        max_length=seq_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = inputs["input_ids"]
    labels = torch.tensor(list(dataset["label"]))

    # Split into training, validation and test sets.
    train_size = 700
    val_size = 100
    test_size = 200
    val_end = train_size + val_size

    # Group each split into batches.
    train_data, train_labels = _batched(input_ids[:train_size], labels[:train_size], train_size)
    val_data, val_labels = _batched(input_ids[train_size:val_end], labels[train_size:val_end], val_size)
    test_data, test_labels = _batched(input_ids[val_end:], labels[val_end:], test_size)

    print(f"Eğitim veri boyutu: {train_data.shape}")
    print(f"Validasyon veri boyutu: {val_data.shape}")
    print(f"Test veri boyutu: {test_data.shape}")
    return (train_data, train_labels), (val_data, val_labels), (test_data, test_labels)

# Modeli eğitme ve validasyon (Batch bazında detaylı)
def train_model(model, train_data, train_labels, val_data, val_labels, epochs=5, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, validating after every epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_data: Sequence of training input batches.
        train_labels: Sequence of label batches, aligned with ``train_data``.
        val_data: Sequence of validation input batches.
        val_labels: Sequence of label batches, aligned with ``val_data``.
        epochs: Number of passes over the training batches.
        lr: Adam learning rate.

    Returns:
        Dict with one entry per epoch in each of ``"train_loss"``,
        ``"train_acc"``, ``"val_loss"``, ``"val_acc"``, ``"time"`` (mean seconds
        per training batch) and ``"memory"`` (mean peak MB per training batch
        as reported by tracemalloc). A metric whose split has no batches is
        recorded as NaN.

    NOTE(review): tracemalloc reports memory allocated through Python's
    allocator; tensor storage allocated natively by PyTorch is probably not
    included, so read "memory" as Python-side overhead only - confirm.
    """
    def _ratio(numerator, denominator):
        # A split smaller than one batch would otherwise abort the run with
        # ZeroDivisionError after the epoch's work has already been done.
        return numerator / denominator if denominator else float("nan")

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    results = {
        "train_loss": [], "train_acc": [], "val_loss": [], "val_acc": [],
        "time": [], "memory": []
    }

    for epoch in tqdm(range(epochs), desc="Epochs"):
        model.train()
        epoch_train_loss = 0
        epoch_train_correct = 0
        epoch_time = 0
        epoch_memory = 0
        total_train_samples = 0

        print(f"\n=== Epoch {epoch+1}/{epochs} - Eğitim ===")
        for i, (batch_data, batch_labels) in enumerate(zip(train_data, train_labels)):
            tracemalloc.start()
            try:
                # perf_counter is a monotonic clock meant for measuring intervals.
                start_time = time.perf_counter()

                optimizer.zero_grad()
                outputs = model(batch_data)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()

                elapsed = time.perf_counter() - start_time
                _, peak_memory = tracemalloc.get_traced_memory()
            finally:
                # Always stop tracing, even if the training step raises.
                tracemalloc.stop()

            _, predicted = torch.max(outputs, 1)
            correct = (predicted == batch_labels).sum().item()
            batch_size = batch_labels.size(0)

            epoch_train_loss += loss.item()
            epoch_train_correct += correct
            epoch_time += elapsed
            epoch_memory += peak_memory / 1024 / 1024
            total_train_samples += batch_size

            print(f"Batch {i+1}/{len(train_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {correct/batch_size:.4f}")

        # Validation
        model.eval()
        epoch_val_loss = 0
        epoch_val_correct = 0
        total_val_samples = 0

        with torch.no_grad():
            for batch_data, batch_labels in zip(val_data, val_labels):
                outputs = model(batch_data)
                loss = criterion(outputs, batch_labels)
                _, predicted = torch.max(outputs, 1)

                epoch_val_loss += loss.item()
                epoch_val_correct += (predicted == batch_labels).sum().item()
                total_val_samples += batch_labels.size(0)

        # Compute the epoch averages
        train_loss = _ratio(epoch_train_loss, len(train_data))
        train_acc = _ratio(epoch_train_correct, total_train_samples)
        val_loss = _ratio(epoch_val_loss, len(val_data))
        val_acc = _ratio(epoch_val_correct, total_val_samples)
        avg_time = _ratio(epoch_time, len(train_data))
        avg_memory = _ratio(epoch_memory, len(train_data))

        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["val_loss"].append(val_loss)
        results["val_acc"].append(val_acc)
        results["time"].append(avg_time)
        results["memory"].append(avg_memory)

        print(f"\nEpoch {epoch+1}/{epochs} Özeti:")
        print(f" - Eğitim Kayıp: {train_loss:.4f} - Eğitim Doğruluk: {train_acc:.4f}")
        print(f" - Validasyon Kayıp: {val_loss:.4f} - Validasyon Doğruluk: {val_acc:.4f}")
        print(f" - Ortalama Batch Süresi: {avg_time:.4f} saniye")
        print(f" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB")

    return results

# Test etme
def test_model(model, test_data, test_labels):
    model.eval()
    correct = 0
    total = 0
    test_loss = 0
    criterion = nn.CrossEntropyLoss()
    
    with torch.no_grad():
        for i, (batch_data, batch_labels) in enumerate(zip(test_data, test_labels)):
            outputs = model(batch_data)
            loss = criterion(outputs, batch_labels)
            _, predicted = torch.max(outputs, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
            test_loss += loss.item()
            print(f"Test Batch {i+1}/{len(test_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {(predicted == batch_labels).sum().item()/batch_labels.size(0):.4f}")
    
    accuracy = correct / total
    avg_loss = test_loss / len(test_data)
    print(f"\nTest Özeti:")
    print(f" - Ortalama Kayıp: {avg_loss:.4f}")
    print(f" - Doğruluk: {accuracy:.4f}")
    print(f" - Toplam Örnek Sayısı: {total}")
    return accuracy, avg_loss

# Ana fonksiyon
def main():
    """Train MHA and several MLA classifiers, test them, and plot the metrics."""
    # Hyperparameters
    batch_size, seq_len, d_model, num_heads = 32, 128, 512, 8
    epochs = 1
    lr = 0.0001  # lowered for slower learning
    latent_dims = [64, 128, 256]  # latent widths to compare

    print("Veri seti hazırlanıyor...")
    (train_data, train_labels), (val_data, val_labels), (test_data, test_labels) = prepare_data(batch_size, seq_len)

    # Print the hyperparameters
    print("\n=== Hiperparametreler ===")
    print(f"Batch Size: {batch_size}")
    print(f"Sequence Length: {seq_len}")
    print(f"Model Dimension: {d_model}")
    print(f"Number of Heads: {num_heads}")
    print(f"Epochs: {epochs}")
    print(f"Learning Rate: {lr}")

    # MHA model
    print("\n=== MHA Modeli ===")
    mha_model = SentimentClassifier(d_model, num_heads, "MHA")
    mha_total_params, mha_trainable_params = count_parameters(mha_model)
    print(f"Toplam Parametre Sayısı: {mha_total_params:,}")
    print(f"Eğitilebilir Parametre Sayısı: {mha_trainable_params:,}")

    print("\nMHA modeli eğitiliyor...")
    mha_results = train_model(mha_model, train_data, train_labels, val_data, val_labels, epochs, lr)
    mha_accuracy, mha_test_loss = test_model(mha_model, test_data, test_labels)
    # Keep the test metrics with the rest, as is done for the MLA models.
    mha_results["test_acc"] = mha_accuracy
    mha_results["test_loss"] = mha_test_loss

    # MLA models (one per latent width)
    mla_results = {}
    for latent_dim in latent_dims:
        print(f"\n=== MLA Modeli (latent_dim={latent_dim}) ===")
        mla_model = SentimentClassifier(d_model, num_heads, "MLA", latent_dim)
        mla_total_params, mla_trainable_params = count_parameters(mla_model)
        print(f"Toplam Parametre Sayısı: {mla_total_params:,}")
        print(f"Eğitilebilir Parametre Sayısı: {mla_trainable_params:,}")

        print(f"\nMLA modeli (latent_dim={latent_dim}) eğitiliyor...")
        mla_results[latent_dim] = train_model(mla_model, train_data, train_labels, val_data, val_labels, epochs, lr)
        mla_accuracy, mla_test_loss = test_model(mla_model, test_data, test_labels)
        mla_results[latent_dim]["test_acc"] = mla_accuracy
        mla_results[latent_dim]["test_loss"] = mla_test_loss

    # Plot the results: one panel per metric, one curve per model.
    epochs_range = range(1, epochs + 1)
    curves = [("MHA", mha_results)]
    curves += [(f"MLA_{latent_dim}", mla_results[latent_dim]) for latent_dim in latent_dims]
    panels = [
        ("train_loss", "Eğitim Kaybı", "Loss"),
        ("train_acc", "Eğitim Doğruluğu", "Accuracy"),
        ("val_loss", "Validasyon Kaybı", "Loss"),
        ("val_acc", "Validasyon Doğruluğu", "Accuracy"),
        ("time", "Ortalama Batch Süresi (s)", "Süre (s)"),
        ("memory", "Ortalama Bellek Kullanımı (MB)", "Bellek (MB)"),
    ]

    plt.figure(figsize=(15, 15))
    for position, (key, title, ylabel) in enumerate(panels, start=1):
        plt.subplot(3, 2, position)
        for label, history in curves:
            # Markers are required: with a single epoch each curve is one
            # point, and a line without markers draws nothing for one point.
            plt.plot(epochs_range, history[key], marker="o", label=label)
        plt.title(title)
        plt.xlabel("Epoch")
        plt.ylabel(ylabel)
        plt.legend()

    plt.tight_layout()
    plt.show()


# Run the experiment when this cell is executed.
if __name__ == "__main__":
    main()

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from datasets import load_dataset
import time
import tracemalloc
import matplotlib.pyplot as plt
from tqdm import tqdm

# RoPE için düzeltilmiş yardımcı fonksiyon
def apply_rotary_pos_emb(q, k, seq_len, dim):
    """Apply rotary position embeddings (RoPE) to query and key tensors.

    Adjacent feature pairs ``(x[2i], x[2i+1])`` are rotated by the angle
    ``position * 10000 ** (-2i / dim)``.

    Args:
        q, k: Tensors whose last dimension is ``dim`` and whose second-to-last
            dimension is ``seq_len`` (callers in this file pass
            ``[batch, heads, seq_len, d_k]``).
        seq_len: Number of positions to generate angles for.
        dim: Size of the last dimension of ``q``/``k``.

    Returns:
        ``(q_rot, k_rot)`` with the same shapes as the inputs. Note the output
        layout: the rotated first components of every pair fill the first half
        of the last dimension and the rotated second components fill the second
        half. ``q`` and ``k`` are permuted identically, so their dot products
        are unaffected by this reordering.
    """
    device = q.device

    # One inverse frequency per feature pair.
    exponents = torch.arange(0, dim, 2, dtype=torch.float, device=device) / dim
    inv_freq = 10000 ** (-exponents)

    # phase[p, i] = p * inv_freq[i]; leading axes are added for broadcasting.
    pos = torch.arange(seq_len, dtype=torch.float, device=device).unsqueeze(1)
    phase = pos * inv_freq.unsqueeze(0)
    sin = torch.sin(phase)[None, None]  # [1, 1, seq_len, dim/2]
    cos = torch.cos(phase)[None, None]

    def _rotate(t):
        # View the feature axis as (dim/2) pairs and rotate each pair in 2-D.
        pairs = t.reshape(*t.shape[:-1], -1, 2)
        first, second = pairs[..., 0], pairs[..., 1]
        rotated = torch.cat(
            [first * cos - second * sin, first * sin + second * cos],
            dim=-1,
        )
        return rotated.view_as(t)

    return _rotate(q), _rotate(k)

# Multi-head Attention (MHA)
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads; each head works on
            ``d_model // num_heads`` features.
        use_rope: When True, rotary position embeddings are applied to the
            queries and keys before the scores are computed.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, use_rope=False, dropout=0.1):
        super().__init__()
        self.d_model, self.num_heads = d_model, num_heads
        self.d_k = d_model // num_heads
        self.use_rope = use_rope

        # Projections are created in the same order as before so that seeded
        # initialisation is unchanged.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape ``[batch, seq, d_model]`` into ``[batch, heads, seq, d_k]``."""
        return projected.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, x):
        """Self-attend over ``x`` of shape ``[batch, seq_len, d_model]``; same shape out."""
        batch_size, seq_len, _ = x.shape
        queries = self._split_heads(self.W_q(x), batch_size, seq_len)
        keys = self._split_heads(self.W_k(x), batch_size, seq_len)
        values = self._split_heads(self.W_v(x), batch_size, seq_len)

        if self.use_rope:
            queries, keys = apply_rotary_pos_emb(queries, keys, seq_len, self.d_k)

        scale = self.d_k ** 0.5
        weights = torch.softmax(torch.matmul(queries, keys.transpose(-2, -1)) / scale, dim=-1)
        weights = self.dropout(weights)

        # Merge the heads back into a single feature axis.
        merged = torch.matmul(weights, values).transpose(1, 2).contiguous()
        merged = merged.view(batch_size, seq_len, self.d_model)
        return self.W_o(merged)

# Multi-head Latent Attention (MLA)
class MultiHeadLatentAttention(nn.Module):
    """Multi-head attention whose keys and values share a low-rank bottleneck.

    The input is first projected down to ``latent_dim`` features; keys and
    values are then both projected up from that shared latent, while queries
    come straight from the input.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads (``d_model // num_heads`` features each).
        latent_dim: Width of the shared key/value latent.
        use_rope: When True, rotary position embeddings are applied to the
            queries and keys.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, latent_dim, use_rope=False, dropout=0.1):
        super().__init__()
        self.d_model, self.num_heads = d_model, num_heads
        self.d_k = d_model // num_heads
        self.latent_dim = latent_dim
        self.use_rope = use_rope

        # Creation order is kept so seeded initialisation is unchanged.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_kv_down = nn.Linear(d_model, latent_dim)
        self.W_k_up = nn.Linear(latent_dim, d_model)
        self.W_v_up = nn.Linear(latent_dim, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape ``[batch, seq, d_model]`` into ``[batch, heads, seq, d_k]``."""
        return projected.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, x):
        """Self-attend over ``x`` of shape ``[batch, seq_len, d_model]``; same shape out."""
        batch_size, seq_len, _ = x.shape
        queries = self._split_heads(self.W_q(x), batch_size, seq_len)

        # Keys and values are both reconstructed from one compressed latent.
        latent = self.W_kv_down(x)
        keys = self._split_heads(self.W_k_up(latent), batch_size, seq_len)
        values = self._split_heads(self.W_v_up(latent), batch_size, seq_len)

        if self.use_rope:
            queries, keys = apply_rotary_pos_emb(queries, keys, seq_len, self.d_k)

        scale = self.d_k ** 0.5
        weights = torch.softmax(torch.matmul(queries, keys.transpose(-2, -1)) / scale, dim=-1)
        weights = self.dropout(weights)

        merged = torch.matmul(weights, values).transpose(1, 2).contiguous()
        merged = merged.view(batch_size, seq_len, self.d_model)
        return self.W_o(merged)

# Longformer Attention (Global token desteğiyle geliştirilmiş)
class LongformerAttention(nn.Module):
    """Sliding-window self-attention with a global first ([CLS]) position.

    Position 0 attends to every position. Every other position ``i`` attends
    only to positions ``j`` with ``|i - j| <= window_size``.

    Fix over the previous version: scores used to be written into a
    zero-initialised matrix, so keys outside the window kept a score of 0 and
    still received softmax weight, meaning the window did not actually
    restrict attention. Out-of-window scores are now set to ``-inf`` before the
    softmax so they get exactly zero weight. The per-position Python loop is
    replaced by one masked matrix product.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads (``d_model // num_heads`` features each).
        window_size: One-sided window radius for non-global positions.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, window_size=4, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.window_size = window_size

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape ``[batch, seq_len, d_model]``; same shape out."""
        batch_size, seq_len, _ = x.size()
        Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

        # allowed[i, j] is True when query i may look at key j.
        positions = torch.arange(seq_len, device=x.device)
        allowed = (positions.unsqueeze(0) - positions.unsqueeze(1)).abs() <= self.window_size
        if seq_len > 0:
            # The first ([CLS]) position gets global attention over the whole sequence.
            allowed[0, :] = True

        # Every row keeps at least its diagonal entry, so no row is entirely
        # -inf and the softmax cannot produce NaN.
        scores = scores.masked_fill(~allowed, float("-inf"))

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        context = torch.matmul(attn, V)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.W_o(context)

# State Space Model (SSM)
class SimpleSSM(nn.Module):
    """Minimal linear state-space layer: ``s_t = s_{t-1} @ A + B(x_t)``, ``y_t = C(s_t)``.

    Changes over the previous version:
      * ``A`` is initialised with a small scale (0.01). An unscaled
        ``randn(state_dim, state_dim)`` matrix amplifies the state at every
        step, so over a long sequence the recurrence overflows to inf/NaN.
      * The ``dropout`` module, previously constructed but never used, is now
        applied to the layer output.
      * The initial state follows the input dtype, and the input/output
        projections are applied once to the whole sequence instead of once per
        time step.

    Args:
        d_model: Width of the input and output features.
        state_dim: Size of the recurrent state.
        dropout: Dropout probability applied to the output.
    """

    def __init__(self, d_model, state_dim=16, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.state_dim = state_dim
        # Small initial scale keeps the linear recurrence contractive at start-up.
        self.A = nn.Parameter(torch.randn(state_dim, state_dim) * 0.01)
        self.B = nn.Linear(d_model, state_dim)
        self.C = nn.Linear(state_dim, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run the recurrence over ``x`` of shape ``[batch, seq_len, d_model]``; same shape out."""
        batch_size, seq_len, _ = x.size()
        if seq_len == 0:
            # Nothing to scan; torch.stack would reject an empty list.
            return x.new_zeros(batch_size, 0, self.d_model)

        # The input projection does not depend on the state, so do it in one call.
        projected = self.B(x)
        state = torch.zeros(batch_size, self.state_dim, device=x.device, dtype=projected.dtype)

        states = []
        for t in range(seq_len):
            state = state @ self.A + projected[:, t]
            states.append(state)

        output = self.C(torch.stack(states, dim=1))
        return self.dropout(output)

# Sınıflandırma modeli
class SentimentClassifier(nn.Module):
    """Token embedding, a configurable stack of sequence layers, and a 2-way head.

    Args:
        d_model: Embedding / hidden width.
        num_heads: Number of heads for the attention-based layers.
        layers_config: Iterable of layer names, applied in order. Supported
            names are ``"MHA"``, ``"MLA"``, ``"Longformer"`` and ``"SSM"``.
        latent_dim: Key/value latent width; required when ``"MLA"`` is used.
        vocab_size: Number of rows in the embedding table. The default 30522 is
            the value that was previously hard-coded (presumably the
            bert-base-uncased vocabulary used by ``prepare_data`` — confirm
            against ``tokenizer.vocab_size``).

    Raises:
        ValueError: If ``layers_config`` contains an unsupported name, or
            ``"MLA"`` is requested without ``latent_dim``. Previously an unknown
            name was silently skipped, producing a shallower model than asked for.
    """

    def __init__(self, d_model, num_heads, layers_config, latent_dim=None, vocab_size=30522):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.layers = nn.ModuleList()

        for layer_type in layers_config:
            if layer_type == "MHA":
                self.layers.append(MultiHeadAttention(d_model, num_heads, use_rope=True))
            elif layer_type == "MLA":
                if latent_dim is None:
                    raise ValueError('latent_dim must be provided when layers_config contains "MLA"')
                self.layers.append(MultiHeadLatentAttention(d_model, num_heads, latent_dim, use_rope=True))
            elif layer_type == "Longformer":
                self.layers.append(LongformerAttention(d_model, num_heads))
            elif layer_type == "SSM":
                self.layers.append(SimpleSSM(d_model))
            else:
                raise ValueError(
                    f"Unknown layer type {layer_type!r}; expected one of 'MHA', 'MLA', 'Longformer', 'SSM'"
                )

        self.fc = nn.Linear(d_model, 2)

    def forward(self, x):
        """Map token ids ``[batch, seq_len]`` to class logits ``[batch, 2]``."""
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        # Classify from the representation at position 0 (the [CLS] token).
        cls_output = x[:, 0]
        return self.fc(cls_output)

# Parametre sayısını hesaplama
def count_parameters(model):
    """Return ``(total, trainable)`` element counts over ``model.parameters()``.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    return total_params, trainable_params

# Veri setini hazırlama (Daha büyük veri seti)
def prepare_data(batch_size=32, seq_len=128, seed=42):
    """Load a 5000-review IMDB subset, tokenize it and pre-batch train/val/test splits.

    Changes over the previous version:
      * The subset is drawn from a seeded shuffle of the full train split
        instead of ``train[:5000]``. The published IMDB train split is ordered
        by label, so a head slice contains (almost) only one class and the
        resulting accuracy numbers are meaningless.
        NOTE(review): verify by printing the label counts of each split.
      * The manual ``"[CLS] "`` prefix is dropped. The tokenizer already adds
        ``[CLS]``/``[SEP]`` by default, so the prefix produced a duplicated
        ``[CLS]`` token and wasted one position of every sequence.

    Args:
        batch_size: Number of samples per batch; incomplete trailing batches
            are discarded.
        seq_len: Every review is padded or truncated to this many tokens.
        seed: Seed for the dataset shuffle, so the subset is reproducible.

    Returns:
        ``((train_data, train_labels), (val_data, val_labels), (test_data, test_labels))``
        where each ``*_data`` tensor has shape ``[num_batches, batch_size, seq_len]``
        and each ``*_labels`` tensor has shape ``[num_batches, batch_size]``.
    """
    train_size = 4000
    val_size = 500
    test_size = 500
    total_size = train_size + val_size + test_size

    dataset = load_dataset("imdb", split="train").shuffle(seed=seed).select(range(total_size))
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    inputs = tokenizer(
        list(dataset["text"]),
        max_length=seq_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = inputs["input_ids"]
    labels = torch.tensor(list(dataset["label"]))

    def _to_batches(start, size):
        # Slice one split and fold it into whole batches, dropping the remainder.
        num_batches = size // batch_size
        usable = num_batches * batch_size
        data = input_ids[start:start + usable].view(num_batches, batch_size, seq_len)
        targets = labels[start:start + usable].view(num_batches, batch_size)
        return data, targets

    train_data, train_labels = _to_batches(0, train_size)
    val_data, val_labels = _to_batches(train_size, val_size)
    test_data, test_labels = _to_batches(train_size + val_size, test_size)

    print(f"Eğitim veri boyutu: {train_data.shape}")
    print(f"Validasyon veri boyutu: {val_data.shape}")
    print(f"Test veri boyutu: {test_data.shape}")
    return (train_data, train_labels), (val_data, val_labels), (test_data, test_labels)

# Modeli eğitme
def train_model(model, train_data, train_labels, val_data, val_labels, epochs=5, lr=0.0001):
    """Train ``model`` with Adam and cross-entropy, validating after every epoch.

    Changes over the previous version:
      * ``tracemalloc`` is stopped in a ``finally`` block, so a failing
        training step no longer leaves tracing switched on in the kernel.
      * Averages over an empty split are reported as NaN instead of raising
        ``ZeroDivisionError`` (e.g. when ``batch_size`` exceeds the split size).

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_data, train_labels: Pre-batched tensors; iterating over the first
            dimension yields one batch at a time.
        val_data, val_labels: Pre-batched validation tensors in the same layout.
        epochs: Number of passes over the training batches.
        lr: Adam learning rate.

    Returns:
        Dict with per-epoch lists under ``"train_loss"``, ``"train_acc"``,
        ``"val_loss"``, ``"val_acc"``, ``"time"`` (mean seconds per training
        batch) and ``"memory"`` (mean tracemalloc peak per batch, in MB).

    NOTE(review): tracemalloc reports memory allocated through Python's
    allocator; tensor storage allocated natively by PyTorch is probably not
    included, so ``"memory"`` should not be read as model memory — confirm.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    results = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": [], "time": [], "memory": []}

    def _safe_mean(total, count):
        # An empty split has no meaningful average; report NaN instead of crashing.
        return total / count if count else float("nan")

    for epoch in tqdm(range(epochs), desc="Epochs"):
        model.train()
        epoch_train_loss = 0
        epoch_train_correct = 0
        epoch_time = 0
        epoch_memory = 0
        total_train_samples = 0

        print(f"\n=== Epoch {epoch+1}/{epochs} - Eğitim ===")
        for i, (batch_data, batch_labels) in enumerate(zip(train_data, train_labels)):
            tracemalloc.start()
            try:
                start_time = time.time()

                optimizer.zero_grad()
                outputs = model(batch_data)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()

                end_time = time.time()
                _, peak_memory = tracemalloc.get_traced_memory()
            finally:
                # Always switch tracing off, even if the step above raised.
                tracemalloc.stop()

            _, predicted = torch.max(outputs, 1)
            correct = (predicted == batch_labels).sum().item()
            batch_size = batch_labels.size(0)

            epoch_train_loss += loss.item()
            epoch_train_correct += correct
            epoch_time += end_time - start_time
            epoch_memory += peak_memory / 1024 / 1024  # bytes -> MB
            total_train_samples += batch_size

            print(f"Batch {i+1}/{len(train_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {correct/batch_size:.4f}")

        model.eval()
        epoch_val_loss = 0
        epoch_val_correct = 0
        total_val_samples = 0

        with torch.no_grad():
            for batch_data, batch_labels in zip(val_data, val_labels):
                outputs = model(batch_data)
                loss = criterion(outputs, batch_labels)
                _, predicted = torch.max(outputs, 1)
                correct = (predicted == batch_labels).sum().item()
                batch_size = batch_labels.size(0)

                epoch_val_loss += loss.item()
                epoch_val_correct += correct
                total_val_samples += batch_size

        # Losses, time and memory are averaged per batch; accuracies per sample.
        train_loss = _safe_mean(epoch_train_loss, len(train_data))
        train_acc = _safe_mean(epoch_train_correct, total_train_samples)
        val_loss = _safe_mean(epoch_val_loss, len(val_data))
        val_acc = _safe_mean(epoch_val_correct, total_val_samples)
        avg_time = _safe_mean(epoch_time, len(train_data))
        avg_memory = _safe_mean(epoch_memory, len(train_data))

        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["val_loss"].append(val_loss)
        results["val_acc"].append(val_acc)
        results["time"].append(avg_time)
        results["memory"].append(avg_memory)

        print(f"\nEpoch {epoch+1}/{epochs} Özeti:")
        print(f" - Eğitim Kayıp: {train_loss:.4f} - Eğitim Doğruluk: {train_acc:.4f}")
        print(f" - Validasyon Kayıp: {val_loss:.4f} - Validasyon Doğruluk: {val_acc:.4f}")
        print(f" - Ortalama Batch Süresi: {avg_time:.4f} saniye")
        print(f" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB")

    return results

# Test etme
def test_model(model, test_data, test_labels):
    model.eval()
    correct = 0
    total = 0
    test_loss = 0
    criterion = nn.CrossEntropyLoss()
    
    with torch.no_grad():
        for i, (batch_data, batch_labels) in enumerate(zip(test_data, test_labels)):
            outputs = model(batch_data)
            loss = criterion(outputs, batch_labels)
            _, predicted = torch.max(outputs, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
            test_loss += loss.item()
            print(f"Test Batch {i+1}/{len(test_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {(predicted == batch_labels).sum().item()/batch_labels.size(0):.4f}")
    
    accuracy = correct / total
    avg_loss = test_loss / len(test_data)
    print(f"\nTest Özeti:")
    print(f" - Ortalama Kayıp: {avg_loss:.4f}")
    print(f" - Doğruluk: {accuracy:.4f}")
    print(f" - Toplam Örnek Sayısı: {total}")
    return accuracy, avg_loss

# Ana fonksiyon
def main():
    """Train and compare the attention / SSM classifier variants on the IMDB subset.

    Builds one ``SentimentClassifier`` per variant, trains and tests each with
    the same data and hyperparameters, then plots the per-epoch metrics.

    Changes over the previous version:
      * ``latent_dims`` (previously defined but unused) now drives the MLA
        variants, so adding a latent size is a one-line change. The resulting
        model names and order are unchanged.
      * Curves are drawn with point markers; with a single epoch a marker-less
        line has one vertex and renders as an empty plot.
      * Y-axis labels are explicit instead of being derived from the last word
        of the title, which produced labels such as "(s)" and "(MB)".
    """
    batch_size, seq_len, d_model, num_heads = 32, 128, 512, 8
    epochs = 1
    lr = 0.0001
    latent_dims = [64, 128]

    print("Veri seti hazırlanıyor...")
    (train_data, train_labels), (val_data, val_labels), (test_data, test_labels) = prepare_data(batch_size, seq_len)

    print("\n=== Hiperparametreler ===")
    print(f"Batch Size: {batch_size}")
    print(f"Sequence Length: {seq_len}")
    print(f"Model Dimension: {d_model}")
    print(f"Number of Heads: {num_heads}")
    print(f"Epochs: {epochs}")
    print(f"Learning Rate: {lr}")

    # Insertion order fixes both the training order and the legend order.
    models = {"MHA": SentimentClassifier(d_model, num_heads, ["MHA", "MHA"])}
    for latent_dim in latent_dims:
        models[f"MLA_{latent_dim}"] = SentimentClassifier(
            d_model, num_heads, ["MLA", "MHA"], latent_dim=latent_dim
        )
    models["Longformer"] = SentimentClassifier(d_model, num_heads, ["Longformer", "MHA"])
    models["SSM_Hybrid"] = SentimentClassifier(d_model, num_heads, ["SSM", "MHA"])

    results = {}
    for name, model in models.items():
        print(f"\n=== {name} Modeli ===")
        total_params, trainable_params = count_parameters(model)
        print(f"Toplam Parametre Sayısı: {total_params:,}")
        print(f"Eğitilebilir Parametre Sayısı: {trainable_params:,}")

        print(f"\n{name} modeli eğitiliyor...")
        results[name] = train_model(model, train_data, train_labels, val_data, val_labels, epochs, lr)
        accuracy, test_loss = test_model(model, test_data, test_labels)
        results[name]["test_acc"] = accuracy
        results[name]["test_loss"] = test_loss

    # Visualisation: one subplot per metric, one curve per model.
    epochs_range = range(1, epochs + 1)
    plt.figure(figsize=(15, 15))

    panels = [
        ("train_loss", "Eğitim Kaybı", "Loss"),
        ("train_acc", "Eğitim Doğruluğu", "Accuracy"),
        ("val_loss", "Validasyon Kaybı", "Loss"),
        ("val_acc", "Validasyon Doğruluğu", "Accuracy"),
        ("time", "Ortalama Batch Süresi (s)", "Süre (s)"),
        ("memory", "Ortalama Bellek Kullanımı (MB)", "Bellek (MB)"),
    ]
    for i, (metric, title, ylabel) in enumerate(panels, 1):
        plt.subplot(3, 2, i)
        for name in models.keys():
            # Markers keep single-epoch runs visible.
            plt.plot(epochs_range, results[name][metric], marker="o", label=name)
        plt.title(title)
        plt.xlabel("Epoch")
        plt.ylabel(ylabel)
        plt.legend()

    plt.tight_layout()
    plt.show()

    # Comments and suggestions (static text, not derived from the measured results).
    print("\n=== Yorumlar ve Öneriler ===")
    print("1. Performans: SSM ve Longformer, uzun dizilerde avantaj sağlayabilir. Şu an 5000 örnek kullanıldı, farklar daha belirgin hale geldi.")
    print("   Daha büyük bir veri seti (örneğin tüm IMDB) ile daha iyi sonuçlar alınabilir.")
    print("2. Karmaşıklık: Katman yığma ve SSM eklenmesi parametre sayısını artırdı, dropout (0.1) ile overfitting önlenmeye çalışıldı.")
    print("   Daha fazla regularization (örneğin weight decay) düşünülebilir.")
    print("3. İyileştirme: LongformerAttention, CLS token’a global dikkat ile geliştirildi. Daha gerçekçi bir Longformer için dilated attention eklenebilir.")

# Entry-point guard: in a notebook kernel __name__ is "__main__", so executing
# this cell runs the main() defined above in the same cell.
if __name__ == "__main__":
    main()

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from datasets import load_dataset
import time
import tracemalloc
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# RoPE için yardımcı fonksiyon
def apply_rotary_pos_emb(q, k, seq_len, dim):
    """Rotate query/key feature pairs by position-dependent angles (RoPE).

    Each adjacent pair ``(x[2i], x[2i+1])`` at position ``p`` is rotated by
    ``p * 10000 ** (-2i / dim)``.

    Args:
        q, k: Tensors whose last dimension is ``dim`` and whose second-to-last
            dimension is ``seq_len`` (the caller in this cell passes
            ``[batch, heads, seq_len, d_k]``).
        seq_len: Number of positions to generate angles for.
        dim: Size of the last dimension of ``q``/``k``.

    Returns:
        ``(q_rot, k_rot)`` shaped like the inputs. Rotated first components of
        all pairs are stored in the first half of the last dimension, rotated
        second components in the second half; ``q`` and ``k`` share this
        layout, so their dot products are unaffected.
    """
    device = q.device
    pair_exponents = torch.arange(0, dim, 2, dtype=torch.float, device=device) / dim
    inv_freq = 10000 ** (-pair_exponents)
    pos = torch.arange(seq_len, dtype=torch.float, device=device).unsqueeze(1)

    # phase has shape [seq_len, dim/2]; two leading axes broadcast over batch and heads.
    phase = (pos * inv_freq.unsqueeze(0))[None, None]
    sin, cos = torch.sin(phase), torch.cos(phase)

    rotated = []
    for tensor in (q, k):
        pairs = tensor.reshape(*tensor.shape[:-1], -1, 2)
        a, b = pairs[..., 0], pairs[..., 1]
        turned = torch.cat([a * cos - b * sin, a * sin + b * cos], dim=-1)
        rotated.append(turned.view_as(tensor))

    return rotated[0], rotated[1]

# Multi-head Latent Attention (MLA)
class MultiHeadLatentAttention(nn.Module):
    """Pre-norm multi-head latent attention block with RoPE and a residual connection.

    Keys and values are reconstructed from a shared ``latent_dim``-wide
    projection of the normalised input; queries come straight from it.

    Fix over the previous version: the residual used to add the
    *LayerNorm-ed* tensor (``x`` was overwritten by ``self.norm(x)`` before
    the final ``+ x``), so the un-normalised layer input never reached the
    output. The skip path now carries the original input, i.e.
    ``out = x + Attn(LayerNorm(x))``.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads (``d_model // num_heads`` features each).
        latent_dim: Width of the shared key/value latent.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, latent_dim, dropout=0.1):
        super(MultiHeadLatentAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.latent_dim = latent_dim

        self.W_q = nn.Linear(d_model, d_model)
        self.W_kv_down = nn.Linear(d_model, latent_dim)
        self.W_k_up = nn.Linear(latent_dim, d_model)
        self.W_v_up = nn.Linear(latent_dim, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """Attend over ``x`` of shape ``[batch, seq_len, d_model]``; same shape out."""
        batch_size, seq_len, _ = x.size()
        residual = x  # keep the un-normalised input for the skip connection
        x = self.norm(x)
        Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        kv_latent = self.W_kv_down(x)
        K = self.W_k_up(kv_latent).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v_up(kv_latent).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

        Q, K = apply_rotary_pos_emb(Q, K, seq_len, self.d_k)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        context = torch.matmul(attn, V)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.W_o(context) + residual  # residual connection around the whole block

# State Space Model (SSM)
class SimpleSSM(nn.Module):
    """Pre-norm linear state-space block with a residual connection.

    Recurrence on the normalised input: ``s_t = s_{t-1} @ A + B(x_t)``,
    ``y_t = C(s_t)``; the block returns ``x + dropout(y)``.

    Changes over the previous version:
      * The residual used to add the *LayerNorm-ed* tensor (``x`` was
        overwritten by ``self.norm(x)`` before the final ``+ x``). The skip
        path now carries the original, un-normalised input.
      * The initial state follows the input dtype, and the input/output
        projections are applied once to the whole sequence instead of once per
        time step.

    Args:
        d_model: Width of the input and output features.
        state_dim: Size of the recurrent state.
        dropout: Dropout probability applied to the SSM output before the residual.
    """

    def __init__(self, d_model, state_dim=16, dropout=0.1):
        super(SimpleSSM, self).__init__()
        self.d_model = d_model
        self.state_dim = state_dim
        # Small initial scale keeps the linear recurrence from growing at start-up.
        self.A = nn.Parameter(torch.randn(state_dim, state_dim) * 0.01)
        self.B = nn.Linear(d_model, state_dim)
        self.C = nn.Linear(state_dim, d_model)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """Run the recurrence over ``x`` of shape ``[batch, seq_len, d_model]``; same shape out."""
        batch_size, seq_len, _ = x.size()
        residual = x  # keep the un-normalised input for the skip connection
        if seq_len == 0:
            # Nothing to scan; torch.stack would reject an empty list.
            return residual

        # The input projection does not depend on the state, so do it in one call.
        projected = self.B(self.norm(x))
        state = torch.zeros(batch_size, self.state_dim, device=x.device, dtype=projected.dtype)

        states = []
        for t in range(seq_len):
            state = state @ self.A + projected[:, t]
            states.append(state)

        output = self.C(torch.stack(states, dim=1))
        return self.dropout(output) + residual  # residual connection

# LLM Modeli
class CustomLLM(nn.Module):
    """Binary sentiment classifier alternating latent-attention and SSM blocks.

    Layers at even indices are ``MultiHeadLatentAttention``; layers at odd
    indices are ``SimpleSSM`` whose state size equals ``latent_dim``.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding / hidden width.
        num_heads: Number of heads in the attention blocks.
        latent_dim: Key/value latent width for attention and state size for the SSM.
        num_layers: Total number of stacked blocks.
        dropout: Dropout probability forwarded to every block.
    """

    def __init__(self, vocab_size, d_model, num_heads, latent_dim, num_layers=2, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Built in index order so that seeded initialisation is unchanged.
        self.layers = nn.ModuleList(
            MultiHeadLatentAttention(d_model, num_heads, latent_dim, dropout)
            if index % 2 == 0
            else SimpleSSM(d_model, state_dim=latent_dim, dropout=dropout)
            for index in range(num_layers)
        )

        self.norm = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, 2)  # two classes for sentiment analysis

    def forward(self, x):
        """Map token ids ``[batch, seq_len]`` to class logits ``[batch, 2]``."""
        hidden = self.embedding(x)
        for block in self.layers:
            hidden = block(hidden)
        hidden = self.norm(hidden)
        # Classify from the representation at position 0 (the [CLS] token).
        return self.fc(hidden[:, 0])

# Parametre sayısını hesaplama
def count_parameters(model):
    """Return ``(total, trainable)`` element counts over ``model.parameters()``.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    """
    sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
    total_params = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, needs_grad in sizes if needs_grad)
    return total_params, trainable_params

# Veri setini hazırlama
def prepare_data(batch_size=32, seq_len=128, seed=42):
    """Load a 5000-review IMDB subset, tokenize it and pre-batch train/val/test splits.

    Changes over the previous version:
      * The subset is drawn from a seeded shuffle of the full train split
        instead of ``train[:5000]``. The published IMDB train split is ordered
        by label, so a head slice contains (almost) only one class; the label
        distribution printed below makes this visible.
      * The manual ``"[CLS] "`` prefix is dropped. The tokenizer already adds
        ``[CLS]``/``[SEP]`` by default, so the prefix produced a duplicated
        ``[CLS]`` token and wasted one position of every sequence.

    Args:
        batch_size: Number of samples per batch; incomplete trailing batches
            are discarded.
        seq_len: Every review is padded or truncated to this many tokens.
        seed: Seed for the dataset shuffle, so the subset is reproducible.

    Returns:
        ``((train_data, train_labels), (val_data, val_labels),
        (test_data, test_labels, test_texts), vocab_size, tokenizer)``.
        Each ``*_data`` tensor has shape ``[num_batches, batch_size, seq_len]``
        and each ``*_labels`` tensor ``[num_batches, batch_size]``.
        ``test_texts`` is the list of raw reviews, in the same order as the
        flattened test batches.
    """
    train_size = 4000
    val_size = 500
    test_size = 500
    total_size = train_size + val_size + test_size

    dataset = load_dataset("imdb", split="train").shuffle(seed=seed).select(range(total_size))
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    texts = list(dataset["text"])
    inputs = tokenizer(
        texts,
        max_length=seq_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = inputs["input_ids"]
    labels = torch.tensor(list(dataset["label"]))

    def _to_batches(start, size):
        # Slice one split and fold it into whole batches, dropping the remainder.
        num_batches = size // batch_size
        usable = num_batches * batch_size
        data = input_ids[start:start + usable].view(num_batches, batch_size, seq_len)
        targets = labels[start:start + usable].view(num_batches, batch_size)
        return data, targets

    train_data, train_labels = _to_batches(0, train_size)
    val_data, val_labels = _to_batches(train_size, val_size)

    test_start = train_size + val_size
    test_data, test_labels = _to_batches(test_start, test_size)
    # Keep exactly the reviews that made it into a full test batch.
    test_texts = texts[test_start:test_start + test_labels.numel()]

    # Report the label distribution of every split.
    print(f"Eğitim veri boyutu: {train_data.shape}, Etiket Dağılımı: {torch.bincount(train_labels.flatten())}")
    print(f"Validasyon veri boyutu: {val_data.shape}, Etiket Dağılımı: {torch.bincount(val_labels.flatten())}")
    print(f"Test veri boyutu: {test_data.shape}, Etiket Dağılımı: {torch.bincount(test_labels.flatten())}")
    return (train_data, train_labels), (val_data, val_labels), (test_data, test_labels, test_texts), tokenizer.vocab_size, tokenizer

# Modeli eğitme
def train_model(model, train_data, train_labels, val_data, val_labels, epochs=5, lr=0.0001):
    """Train ``model`` with Adam and cross-entropy, validating after every epoch.

    Changes over the previous version:
      * ``tracemalloc`` is stopped in a ``finally`` block, so a failing
        training step no longer leaves tracing switched on in the kernel.
      * Averages over an empty split are reported as NaN instead of raising
        ``ZeroDivisionError`` (e.g. when ``batch_size`` exceeds the split size).

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_data, train_labels: Pre-batched tensors; iterating over the first
            dimension yields one batch at a time.
        val_data, val_labels: Pre-batched validation tensors in the same layout.
        epochs: Number of passes over the training batches.
        lr: Adam learning rate.

    Returns:
        Dict with per-epoch lists under ``"train_loss"``, ``"train_acc"``,
        ``"val_loss"``, ``"val_acc"``, ``"time"`` (mean seconds per training
        batch) and ``"memory"`` (mean tracemalloc peak per batch, in MB).

    NOTE(review): tracemalloc reports memory allocated through Python's
    allocator; tensor storage allocated natively by PyTorch is probably not
    included, so ``"memory"`` should not be read as model memory — confirm.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    results = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": [], "time": [], "memory": []}

    def _safe_mean(total, count):
        # An empty split has no meaningful average; report NaN instead of crashing.
        return total / count if count else float("nan")

    for epoch in tqdm(range(epochs), desc="Epochs"):
        model.train()
        epoch_train_loss = 0
        epoch_train_correct = 0
        epoch_time = 0
        epoch_memory = 0
        total_train_samples = 0

        print(f"\n=== Epoch {epoch + 1}/{epochs} - Eğitim ===")
        for i, (batch_data, batch_labels) in enumerate(zip(train_data, train_labels)):
            tracemalloc.start()
            try:
                start_time = time.time()

                optimizer.zero_grad()
                outputs = model(batch_data)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()

                end_time = time.time()
                _, peak_memory = tracemalloc.get_traced_memory()
            finally:
                # Always switch tracing off, even if the step above raised.
                tracemalloc.stop()

            _, predicted = torch.max(outputs, 1)
            correct = (predicted == batch_labels).sum().item()
            batch_size = batch_labels.size(0)

            epoch_train_loss += loss.item()
            epoch_train_correct += correct
            epoch_time += end_time - start_time
            epoch_memory += peak_memory / 1024 / 1024  # bytes -> MB
            total_train_samples += batch_size

            # Log every tenth batch to keep the cell output readable.
            if (i + 1) % 10 == 0:
                print(f"Batch {i + 1}/{len(train_data)} - Kayıp: {loss.item():.4f} - Doğruluk: {correct / batch_size:.4f}")

        model.eval()
        epoch_val_loss = 0
        epoch_val_correct = 0
        total_val_samples = 0

        with torch.no_grad():
            for batch_data, batch_labels in zip(val_data, val_labels):
                outputs = model(batch_data)
                loss = criterion(outputs, batch_labels)
                _, predicted = torch.max(outputs, 1)
                correct = (predicted == batch_labels).sum().item()
                batch_size = batch_labels.size(0)

                epoch_val_loss += loss.item()
                epoch_val_correct += correct
                total_val_samples += batch_size

        # Losses, time and memory are averaged per batch; accuracies per sample.
        train_loss = _safe_mean(epoch_train_loss, len(train_data))
        train_acc = _safe_mean(epoch_train_correct, total_train_samples)
        val_loss = _safe_mean(epoch_val_loss, len(val_data))
        val_acc = _safe_mean(epoch_val_correct, total_val_samples)
        avg_time = _safe_mean(epoch_time, len(train_data))
        avg_memory = _safe_mean(epoch_memory, len(train_data))

        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["val_loss"].append(val_loss)
        results["val_acc"].append(val_acc)
        results["time"].append(avg_time)
        results["memory"].append(avg_memory)

        print(f"\nEpoch {epoch + 1}/{epochs} Özeti:")
        print(f" - Eğitim Kayıp: {train_loss:.4f} - Eğitim Doğruluk: {train_acc:.4f}")
        print(f" - Validasyon Kayıp: {val_loss:.4f} - Validasyon Doğruluk: {val_acc:.4f}")
        print(f" - Ortalama Batch Süresi: {avg_time:.4f} saniye")
        print(f" - Ortalama Bellek Kullanımı: {avg_memory:.2f} MB")

    return results

# Test etme
def test_model(model, test_data, test_labels, test_texts, tokenizer, num_samples=5):
    """Evaluate ``model`` on the test batches and report metrics, a confusion matrix and samples.

    Args:
        model: Module mapping a batch of inputs to class logits.
        test_data, test_labels: Pre-batched tensors; iterating over the first
            dimension yields one batch at a time.
        test_texts: Raw review strings, in the same order as the flattened
            test batches.
        tokenizer: Accepted for interface compatibility; not used here.
        num_samples: How many example predictions to print.

    Returns:
        ``(accuracy, avg_loss)``: accuracy over all samples and the mean of the
        per-batch cross-entropy losses.
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    num_batches = len(test_data)
    hits, seen, loss_sum = 0, 0, 0
    all_preds, all_labels, all_outputs = [], [], []

    with torch.no_grad():
        for batch_no, (inputs, targets) in enumerate(zip(test_data, test_labels), start=1):
            logits = model(inputs)
            batch_loss = loss_fn(logits, targets).item()
            probabilities = torch.softmax(logits, dim=1)
            predictions = torch.max(logits, 1)[1]
            batch_hits = (predictions == targets).sum().item()
            batch_count = targets.size(0)

            seen += batch_count
            hits += batch_hits
            loss_sum += batch_loss

            # Flatten per-sample results across batches for the reports below.
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())
            all_outputs.extend(probabilities.cpu().numpy())

            print(f"Test Batch {batch_no}/{num_batches} - Kayıp: {batch_loss:.4f} - Doğruluk: {batch_hits / batch_count:.4f}")

    accuracy = hits / seen
    avg_loss = loss_sum / num_batches

    print("\nTest Özeti:")
    print(f" - Ortalama Kayıp: {avg_loss:.4f}")
    print(f" - Doğruluk: {accuracy:.4f}")
    print(f" - Toplam Örnek Sayısı: {seen}")
    print(f" - Tahmin Dağılımı: {np.bincount(all_preds)}")
    print(f" - Gerçek Etiket Dağılımı: {np.bincount(all_labels)}")

    # Confusion matrix: plotted only when both labels and predictions contain
    # at least two distinct classes; otherwise the raw matrix is printed.
    cm = confusion_matrix(all_labels, all_preds)
    both_have_two_classes = len(np.unique(all_labels)) >= 2 and len(np.unique(all_preds)) >= 2
    if both_have_two_classes:
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negatif", "Pozitif"])
        fig, ax = plt.subplots()
        disp.plot(cmap=plt.cm.Blues, ax=ax)
        ax.set_xticks([0, 1])
        ax.set_yticks([0, 1])
        ax.set_xticklabels(["Negatif", "Pozitif"])
        ax.set_yticklabels(["Negatif", "Pozitif"])
        plt.title("Confusion Matrix")
        plt.show()
    else:
        print("Uyarı: Test verisinde tek bir sınıf tahmin edildi veya mevcut, Confusion Matrix tam anlamıyla çizilemez.")
        print(f"Confusion Matrix: {cm}")

    # Example predictions, with long reviews truncated to 100 characters.
    print(f"\n=== Örnek Tahminler (İlk {num_samples} Örnek) ===")
    for idx in range(min(num_samples, len(test_texts))):
        review = test_texts[idx]
        if len(review) > 100:
            text = review[:100] + "..."
        else:
            text = review
        true_label = "Pozitif" if all_labels[idx] == 1 else "Negatif"
        pred_label = "Pozitif" if all_preds[idx] == 1 else "Negatif"
        softmax_scores = all_outputs[idx]
        print(f"Metin: {text}")
        print(f"Gerçek Etiket: {true_label} - Tahmin Edilen Etiket: {pred_label}")
        print(f"Softmax Skorları: Negatif: {softmax_scores[0]:.4f}, Pozitif: {softmax_scores[1]:.4f}")
        print("-" * 50)

    return accuracy, avg_loss

# Ana fonksiyon
def main():
    """Run the whole experiment: prepare data, train, evaluate, and plot.

    Steps, in order:
      1. Build the train / validation / test splits with ``prepare_data``.
      2. Print the hyperparameters and the model's parameter counts.
      3. Train with ``train_model`` and evaluate with ``test_model``; the test
         accuracy and loss are stored in the results dict under ``"test_acc"``
         and ``"test_loss"``.
      4. Plot six series from the results dict (``train_loss``, ``train_acc``,
         ``val_loss``, ``val_acc``, ``time``, ``memory``) in a 3x2 grid.
      5. Print a fixed list of remarks and suggestions.

    Takes no arguments and returns ``None``; everything is reported through
    stdout and matplotlib figures.
    """
    batch_size, seq_len, d_model, num_heads = 32, 128, 512, 8
    epochs = 2  # raised to train for more epochs
    lr = 0.0001
    latent_dim = 128
    num_layers = 4

    print("Veri seti hazırlanıyor...")
    (train_data, train_labels), (val_data, val_labels), (test_data, test_labels, test_texts), vocab_size, tokenizer = prepare_data(batch_size, seq_len)

    print("\n=== Hiperparametreler ===")
    print(f"Batch Size: {batch_size}")
    print(f"Sequence Length: {seq_len}")
    print(f"Model Dimension: {d_model}")
    print(f"Number of Heads: {num_heads}")
    print(f"Latent Dimension: {latent_dim}")
    print(f"Number of Layers: {num_layers}")
    print(f"Epochs: {epochs}")
    print(f"Learning Rate: {lr}")
    print(f"Vocabulary Size: {vocab_size}")

    # Build the model and report its size.
    model = CustomLLM(vocab_size, d_model, num_heads, latent_dim, num_layers)
    total_params, trainable_params = count_parameters(model)
    print("\n=== CustomLLM Modeli ===")
    print(f"Toplam Parametre Sayısı: {total_params:,}")
    print(f"Eğitilebilir Parametre Sayısı: {trainable_params:,}")

    print("\nModel eğitiliyor...")
    results = train_model(model, train_data, train_labels, val_data, val_labels, epochs, lr)
    accuracy, test_loss = test_model(model, test_data, test_labels, test_texts, tokenizer)
    results["test_acc"] = accuracy
    results["test_loss"] = test_loss

    # Visualization: one panel per tracked series, laid out row by row.
    panels = [
        ("train_loss", "Eğitim Kaybı"), ("train_acc", "Eğitim Doğruluğu"),
        ("val_loss", "Validasyon Kaybı"), ("val_acc", "Validasyon Doğruluğu"),
        ("time", "Ortalama Batch Süresi (s)"), ("memory", "Ortalama Bellek Kullanımı (MB)"),
    ]
    fig, axes = plt.subplots(3, 2, figsize=(15, 15))

    for ax, (metric, title) in zip(axes.flat, panels):
        values = results[metric]
        # Size the x axis from the series itself rather than from `epochs`, so
        # a series with a different number of points (e.g. training stopped
        # early) still plots instead of raising a shape-mismatch error.
        epochs_range = range(1, len(values) + 1)
        ax.plot(epochs_range, values, label="CustomLLM")
        ax.set_title(title)
        ax.set_xlabel("Epoch")
        # The y label is the last word of the title (the quantity or its unit).
        ax.set_ylabel(title.split()[-1])
        ax.legend()

    fig.tight_layout()
    plt.show()

    # Remarks and suggestions.
    # NOTE: these lines are static text; they are not computed from `results`,
    # so they may not describe the run that was just executed.
    print("\n=== Yorumlar ve Öneriler ===")
    print("1. Performans: Model, test setinde yalnızca Negatif tahmin etti. Veri seti dağılımı kontrol edilmeli veya daha fazla epoch ile genelleme artırılmalı.")
    print("2. Karmaşıklık: Eğitim çok hızlı tamamlandı, öğrenme oranı (lr) artırılabilir (örneğin 0.001) veya model kapasitesi genişletilebilir.")
    print("3. İyileştirme: Softmax skorları incelenerek modelin sınıf yanlılığı (bias) durumu analiz edilmeli. Daha dengeli bir veri seti kullanılabilir.")

# Entry point: run the full pipeline only when this code executes as the
# top-level module, not when it is imported.
# NOTE(review): inside a notebook kernel `__name__` is normally "__main__", so
# running this cell starts data preparation and training — confirm intended.
if __name__ == "__main__":
    main()