In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoConfig,
    get_linear_schedule_with_warmup
)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Kullanılan cihaz: {device}")

class CustomerQueryDataset(Dataset):
    """Map-style dataset that tokenizes customer queries on access.

    Every item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels`` so that a ``DataLoader`` can batch it directly.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Store the raw inputs; nothing is tokenized until an item is read.

        Args:
            texts: Indexable collection of query texts.
            labels: Indexable collection of class ids, aligned with ``texts``.
            tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
                padding=..., max_length=..., return_tensors='pt')``.
            max_length: Length every sample is padded or truncated to.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of samples (one per entry of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors as a dict."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # flatten() collapses each tokenizer tensor to 1-D (presumably removing
        # the batch dimension that return_tensors='pt' adds for a single text).
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

class BERTClassifier(nn.Module):
    """Pretrained transformer encoder followed by an MLP classification head.

    The head maps the encoder's sentence representation through two hidden
    layers (512 and 256 units, ReLU + dropout) to ``num_classes`` logits.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        """Load the pretrained encoder and build the classification head.

        Args:
            model_name: Checkpoint name or path accepted by
                ``AutoConfig``/``AutoModel.from_pretrained``.
            num_classes: Number of output logits.
            dropout_rate: Dropout probability used before and inside the head.
        """
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.bert = AutoModel.from_pretrained(model_name)

        # Classification head
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of encoded texts.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            The output of the classification head (one row of
            ``num_classes`` logits per input row).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Prefer the encoder's pooled sentence vector. Encoders without a
        # pooling layer (e.g. DistilBERT) do not provide one, so fall back to
        # the hidden state of the first token instead of failing.
        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            pooled_output = outputs.last_hidden_state[:, 0]

        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

class BERTTrainer:
    """Fine-tune a classifier and evaluate it on a validation loader.

    Holds the model, data loaders, loss, AdamW optimizer and a linear
    warmup/decay learning-rate schedule, and records per-epoch metrics in
    ``train_losses``, ``val_losses`` and ``val_accuracies``.
    """

    def __init__(self, model, train_loader, val_loader, num_classes, device, epochs=5):
        """Move the model to ``device`` and set up loss, optimizer and schedule.

        Args:
            model: Module called as ``model(input_ids, attention_mask)``.
            train_loader: Iterable of training batches (dicts with
                ``input_ids``, ``attention_mask`` and ``labels``).
            val_loader: Iterable of validation batches with the same keys.
            num_classes: Number of target classes (stored for reference).
            device: Device the model and batches are moved to.
            epochs: Number of epochs the learning-rate schedule should span.
                ``train`` re-creates the schedule if it is asked for a
                different number before any optimizer step has been taken.
        """
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.num_classes = num_classes
        self.device = device

        # Loss function
        self.criterion = nn.CrossEntropyLoss()

        # Optimizer
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=2e-5,
            weight_decay=0.01,
        )

        # Scheduler bookkeeping: how many optimizer steps have run and how
        # many epochs the current schedule was sized for.
        self._optimizer_steps = 0
        self._scheduled_epochs = None
        self.scheduler = self._build_scheduler(epochs)

        # Metrics tracking
        self.train_losses = []
        self.val_losses = []
        self.val_accuracies = []

    def _build_scheduler(self, epochs):
        """Create a linear warmup/decay schedule sized for ``epochs`` epochs."""
        total_steps = len(self.train_loader) * epochs
        self._scheduled_epochs = epochs
        return get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=total_steps // 10,  # first 10% of steps warm up
            num_training_steps=total_steps,
        )

    def train_epoch(self):
        """Run one pass over the training loader and return the mean batch loss."""
        self.model.train()
        total_loss = 0

        for batch in tqdm(self.train_loader, desc="Training"):
            # Move the batch to the training device
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)

            # Forward pass
            self.optimizer.zero_grad()
            logits = self.model(input_ids, attention_mask)
            loss = self.criterion(logits, labels)

            # Backward pass; gradients are clipped to norm 1.0 before the step
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            self.scheduler.step()
            self._optimizer_steps += 1

            total_loss += loss.item()

        return total_loss / len(self.train_loader)

    def validate(self):
        """Evaluate on the validation loader without gradient tracking.

        Returns:
            Tuple ``(val_loss, val_accuracy, predictions, labels)`` where the
            loss is the mean batch loss and the last two are flat lists
            covering every validation sample in loader order.
        """
        self.model.eval()
        total_loss = 0
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                logits = self.model(input_ids, attention_mask)
                loss = self.criterion(logits, labels)

                total_loss += loss.item()

                # Predicted class = index of the largest logit
                predictions = torch.argmax(logits, dim=1)
                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        val_loss = total_loss / len(self.val_loader)
        val_accuracy = accuracy_score(all_labels, all_predictions)

        return val_loss, val_accuracy, all_predictions, all_labels

    def train(self, epochs=5):
        """Train for ``epochs`` epochs, checkpointing the best model.

        After every epoch the model is validated; whenever the validation
        accuracy improves (and always after the first epoch) the weights are
        written to ``best_bert_model.pth``.

        Args:
            epochs: Number of training epochs.

        Returns:
            Tuple ``(predictions, labels)`` from the epoch whose weights were
            saved as the best checkpoint, so a later evaluation describes the
            saved model. Both lists are empty when ``epochs`` is 0.
        """
        # Keep the learning-rate schedule in sync with the requested number of
        # epochs. Once optimizer steps have been taken the schedule is left
        # alone so a running decay is not restarted.
        if epochs != self._scheduled_epochs and self._optimizer_steps == 0:
            self.scheduler = self._build_scheduler(epochs)

        print(f"Eğitim başlıyor - {epochs} epoch")
        print(f"Train batches: {len(self.train_loader)}")
        print(f"Validation batches: {len(self.val_loader)}")

        best_val_accuracy = None
        best_predictions, best_labels = [], []

        for epoch in range(epochs):
            print(f"\n{'='*50}")
            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"{'='*50}")

            # Training
            train_loss = self.train_epoch()
            self.train_losses.append(train_loss)

            # Validation
            val_loss, val_accuracy, val_predictions, val_labels = self.validate()
            self.val_losses.append(val_loss)
            self.val_accuracies.append(val_accuracy)

            # Report the epoch results
            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val Loss: {val_loss:.4f}")
            print(f"Val Accuracy: {val_accuracy:.4f}")

            # Save the best model. The first epoch always counts as an
            # improvement so that a checkpoint file exists after training.
            if best_val_accuracy is None or val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                best_predictions, best_labels = val_predictions, val_labels
                torch.save(self.model.state_dict(), 'best_bert_model.pth')
                print(f"En iyi model kaydedildi! Accuracy: {best_val_accuracy:.4f}")

        if best_val_accuracy is None:
            # No epoch ran; report 0 like the initial "best" value would.
            best_val_accuracy = 0
        print(f"\nEğitim tamamlandı! En iyi validation accuracy: {best_val_accuracy:.4f}")
        return best_predictions, best_labels

def load_data():
    """Load the train/validation splits and the optional test set.

    The train and validation CSVs are required; any error while reading them
    propagates to the caller. The test CSV is optional.

    Returns:
        Tuple ``(train_df, val_df, test_df)``. ``test_df`` is ``None`` when
        ``customer_queries_test.csv`` does not exist or cannot be read.
    """
    print("Veriler yükleniyor...")
    # Train data
    train_df = pd.read_csv('../data/processed/train_split.csv')
    print(f"Train verisi yüklendi: {len(train_df)} örnek")

    # Validation data
    val_df = pd.read_csv('../data/processed/val_split.csv')
    print(f"Validation verisi yüklendi: {len(val_df)} örnek")

    # Test data (optional). Only file/parse problems are treated as "no test
    # set"; anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    try:
        test_df = pd.read_csv('customer_queries_test.csv')
    except FileNotFoundError:
        test_df = None
        print("Test verisi bulunamadı")
    except (OSError, ValueError) as err:
        # The file exists but is unreadable or malformed; say so instead of
        # claiming it was not found.
        test_df = None
        print(f"Test verisi okunamadı: {err}")
    else:
        print(f"Test verisi yüklendi: {len(test_df)} örnek")

    return train_df, val_df, test_df

def create_data_loaders(train_df, val_df, tokenizer, batch_size=16, max_length=128,
                        num_workers=2):
    """Build the training and validation DataLoaders.

    Args:
        train_df: DataFrame with ``query_original`` and ``label_encoded`` columns.
        val_df: DataFrame with the same two columns.
        tokenizer: Tokenizer handed to ``CustomerQueryDataset``.
        batch_size: Batch size used by both loaders.
        max_length: Token length handed to ``CustomerQueryDataset``.
        num_workers: Number of DataLoader worker processes. Pass 0 to load
            data in the main process, e.g. where worker processes are not
            usable.

    Returns:
        Tuple ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    print("DataLoader'lar oluşturuluyor...")

    def _make_loader(frame, shuffle):
        # One dataset per split, reading the text and label columns as arrays.
        dataset = CustomerQueryDataset(
            frame['query_original'].values,
            frame['label_encoded'].values,
            tokenizer,
            max_length,
        )
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

    train_loader = _make_loader(train_df, True)
    val_loader = _make_loader(val_df, False)

    return train_loader, val_loader

def plot_training_metrics(trainer):
    """Plot per-epoch losses and validation accuracy in two side-by-side panels.

    Reads ``trainer.train_losses``, ``trainer.val_losses`` and
    ``trainer.val_accuracies``, saves the figure to ``training_metrics.png``
    and then displays it.
    """
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(trainer.train_losses) + 1)

    # Left panel: training vs. validation loss.
    loss_ax.plot(epochs, trainer.train_losses, 'b-', label='Train Loss')
    loss_ax.plot(epochs, trainer.val_losses, 'r-', label='Validation Loss')

    # Right panel: validation accuracy.
    acc_ax.plot(epochs, trainer.val_accuracies, 'g-', label='Validation Accuracy')

    # Shared decoration for both panels.
    panels = (
        (loss_ax, 'Model Loss', 'Loss'),
        (acc_ax, 'Model Accuracy', 'Accuracy'),
    )
    for panel, title, y_label in panels:
        panel.set_title(title)
        panel.set_xlabel('Epoch')
        panel.set_ylabel(y_label)
        panel.legend()
        panel.grid(True)

    plt.tight_layout()
    plt.savefig('training_metrics.png', dpi=300, bbox_inches='tight')
    plt.show()

def evaluate_model(predictions, true_labels, num_classes):
    """Print detailed evaluation metrics and plot the confusion matrix.

    Args:
        predictions: Predicted class ids.
        true_labels: Ground-truth class ids, aligned with ``predictions``.
        num_classes: Total number of classes; ids ``0 .. num_classes - 1``
            are reported as ``Class_0 .. Class_{num_classes - 1}``.

    Returns:
        The ``classification_report`` result as a dict (``output_dict=True``).

    Side effects:
        Prints the metrics, saves ``confusion_matrix.png`` and shows the plot.
    """
    print("\n" + "="*50)
    print("DETAYLI MODEL DEĞERLENDİRMESİ")
    print("="*50)

    # Overall metrics
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Genel Doğruluk: {accuracy:.4f}")

    # Fix the label set explicitly. Without it the report infers the classes
    # from the data and rejects ``target_names`` whenever a class is absent
    # from both the labels and the predictions.
    class_ids = list(range(num_classes))

    # Per-class report
    print("\nSınıf Bazında Performans:")
    class_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=[f"Class_{i}" for i in class_ids],
        output_dict=True,
        zero_division=0,  # classes without samples score 0 instead of warning
    )

    # Show the per-class performance as a DataFrame
    class_df = pd.DataFrame(class_report).transpose()
    print(class_df.round(3))

    # Confusion matrix over the same fixed label set, so row/column ``i``
    # always refers to class ``i``.
    cm = confusion_matrix(true_labels, predictions, labels=class_ids)

    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    return class_report

def predict_single_query(model, tokenizer, query, device, max_length=128):
    """Classify one query string.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it is
            switched to eval mode first.
        tokenizer: Callable producing ``input_ids`` and ``attention_mask``.
        query: Text to classify.
        device: Device the encoded tensors are moved to.
        max_length: Length the query is padded or truncated to.

    Returns:
        Tuple ``(predicted_class, confidence, probabilities)``: the index of
        the most probable class, its softmax probability, and the full
        probability vector as a NumPy array.
    """
    model.eval()

    # Encode the query with the same settings used for training samples.
    encoded = tokenizer(
        query,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        probabilities = torch.softmax(model(ids, mask), dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        row = probabilities[0]
        confidence = row[predicted_class].item()

    return predicted_class, confidence, row.cpu().numpy()

def main():
    """Run the full pipeline: load data, train, evaluate and show sample predictions.

    Side effects: prints progress, and (through the helpers it calls) writes
    ``best_bert_model.pth``, ``training_metrics.png`` and
    ``confusion_matrix.png`` to the working directory.
    """
    print("🤖 BERT Müşteri Sorgu Sınıflandırıcısı")
    print("="*50)

    # Parameters
    # A smaller checkpoint such as 'distilbert-base-uncased' would be faster;
    # verify that BERTClassifier supports its output format before switching.
    MODEL_NAME = 'bert-base-uncased'
    MAX_LENGTH = 128
    BATCH_SIZE = 16
    EPOCHS = 5
    NUM_CLASSES = 30

    # 1. Load the data
    train_df, val_df, test_df = load_data()

    # 2. Load the tokenizer
    print(f"BERT tokenizer yükleniyor: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # 3. Build the DataLoaders
    train_loader, val_loader = create_data_loaders(
        train_df, val_df, tokenizer, BATCH_SIZE, MAX_LENGTH
    )

    # 4. Build the model
    print(f"BERT model oluşturuluyor: {MODEL_NAME}")
    model = BERTClassifier(MODEL_NAME, NUM_CLASSES)

    # 5. Build the trainer
    trainer = BERTTrainer(model, train_loader, val_loader, NUM_CLASSES, device)

    # 6. Train the model
    val_predictions, val_labels = trainer.train(epochs=EPOCHS)

    # 7. Plot the training curves
    plot_training_metrics(trainer)

    # 8. Detailed evaluation (the returned report dict is not needed here)
    evaluate_model(val_predictions, val_labels, NUM_CLASSES)

    # 9. Sample predictions
    print("\n" + "="*50)
    print("ÖRNEK TAHMİNLER")
    print("="*50)

    example_queries = [
        "I need help with my booking",
        "How can I change my flight?",
        "Payment was declined",
        "App is not working",
        "Where is my luggage?"
    ]

    # Restore the best checkpoint. map_location remaps the stored tensors to
    # the active device, so a checkpoint written on GPU also loads on CPU.
    model.load_state_dict(torch.load('best_bert_model.pth', map_location=device))

    for query in example_queries:
        predicted_class, confidence, probs = predict_single_query(
            model, tokenizer, query, device
        )
        print(f"Sorgu: '{query}'")
        print(f"Tahmin: Class_{predicted_class} (Güven: {confidence:.3f})")
        print("-" * 30)

    print("\n✅ Eğitim ve değerlendirme tamamlandı!")
    print("📁 Kaydedilen dosyalar:")
    print("   - best_bert_model.pth (En iyi model)")
    print("   - training_metrics.png (Eğitim grafikleri)")
    print("   - confusion_matrix.png (Karışıklık matrisi)")

# Run the pipeline only when this file is executed as the main module
# (not when it is imported).
if __name__ == "__main__":
    main()

Kullanılan cihaz: cpu
🤖 BERT Müşteri Sorgu Sınıflandırıcısı
Veriler yükleniyor...
Train verisi yüklendi: 16000 örnek
Validation verisi yüklendi: 4000 örnek
Test verisi bulunamadı
BERT tokenizer yükleniyor: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DataLoader'lar oluşturuluyor...
BERT model oluşturuluyor: bert-base-uncased


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Eğitim başlıyor - 5 epoch
Train batches: 1000
Validation batches: 250

Epoch 1/5


Training:   0%|          | 0/1000 [00:00<?, ?it/s]