# BERT Deep Learning Model - Bonus Implementation
## Advanced Deep Learning Approach with GPU Acceleration
## Objective: As a bonus to the traditional ML pipeline, this implementation uses BERT (Bidirectional Encoder Representations from Transformers) for customer query classification.

## Why BERT as a Bonus?
### Deep Learning Alternative: Complements the classical ML approaches (Logistic Regression, SVM, Random Forest, XGBoost)
### State-of-the-art Performance: Transformer-based architecture for superior text understanding
### GPU Training: Implemented in Google Colab for faster training with GPU acceleration
### Transfer Learning: Leverages pre-trained BERT model fine-tuned on our customer query data


## Model Details
### Architecture: BERT-base-uncased with custom classification head
### Training: 5 epochs with AdamW optimizer and learning rate scheduling
### Hardware: Google Colab GPU runtime for accelerated training
### Data: Same preprocessed customer queries from the main pipeline

# In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoConfig,
    get_linear_schedule_with_warmup
)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device once at import time: CUDA when PyTorch reports it
# as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

class CustomerQueryDataset(Dataset):
    """Map-style dataset that tokenizes one customer query per lookup.

    Every item is a dict holding ``input_ids`` and ``attention_mask``
    (each flattened to 1-D) plus ``labels`` as a scalar ``torch.long`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # The dataset size is driven by the number of texts.
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        # Tokenize on access; every sample is padded/truncated to max_length.
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Flatten the tokenizer's tensors so each sample field is 1-D.
        sample = {
            field: torch.flatten(encoded[field])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

class BERTClassifier(nn.Module):
    """Pre-trained transformer encoder with an MLP classification head.

    The encoder is loaded with ``AutoModel.from_pretrained(model_name)``; the
    head maps the pooled sentence vector through 512- and 256-unit hidden
    layers (ReLU + dropout) to ``num_classes`` logits.

    Args:
        model_name: Identifier passed to ``AutoConfig``/``AutoModel``.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability used before and inside the head.
    """

    def __init__(self, model_name, num_classes, dropout_rate=0.3):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.bert = AutoModel.from_pretrained(model_name)
        self.model_name = model_name

        # Auto-detect the encoder width from whichever attribute the config
        # exposes; fall back to 768 when neither is present.
        if hasattr(self.config, 'hidden_size'):
            hidden_size = self.config.hidden_size
        elif hasattr(self.config, 'dim'):  # For DistilBERT
            hidden_size = self.config.dim
        else:
            hidden_size = 768  # default

        print(f"Model created - Hidden size: {hidden_size}, Classes: {num_classes}")

        # Classification head
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of tokenized queries.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            The output of the classification head for the pooled vector.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Use the encoder's pooled output when it provides one; otherwise
        # (e.g. DistilBERT) take the hidden state of the first token.
        if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
            pooled_output = outputs.pooler_output
        else:
            pooled_output = outputs.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]

        # Dropout and classification
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

class BERTTrainer:
    """Fine-tuning and validation loop for a classifier model.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits; it is moved to ``device``.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        val_loader: Same batch layout, used for validation.
        num_classes: Number of target classes (stored for reference).
        device: Device that model and batches are moved to.
        epochs: Number of epochs the learning-rate schedule is sized for.
            Defaults to 5, matching the default of :meth:`train`.
    """

    def __init__(self, model, train_loader, val_loader, num_classes, device, epochs=5):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.num_classes = num_classes
        self.device = device

        # Loss function
        self.criterion = nn.CrossEntropyLoss()

        # Optimizer
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=2e-5,
            weight_decay=0.01
        )

        # Scheduler: linear warmup over the first 10% of steps, sized for
        # `epochs` passes over the training loader.
        self.scheduled_epochs = epochs
        total_steps = len(train_loader) * epochs
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=total_steps // 10,
            num_training_steps=total_steps
        )

        # Metrics tracking (one entry per completed epoch)
        self.train_losses = []
        self.val_losses = []
        self.val_accuracies = []

    def train_epoch(self):
        """Run one pass over the training loader and return the mean batch loss."""
        self.model.train()
        total_loss = 0

        for batch in tqdm(self.train_loader, desc="Training"):
            # Move data to the target device
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)

            # Forward pass
            self.optimizer.zero_grad()
            logits = self.model(input_ids, attention_mask)
            loss = self.criterion(logits, labels)

            # Backward pass; gradients are clipped to a max norm of 1.0 and
            # the scheduler advances once per batch.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            self.scheduler.step()

            total_loss += loss.item()

        # max(..., 1) avoids ZeroDivisionError on an empty loader.
        return total_loss / max(len(self.train_loader), 1)

    def validate(self):
        """Evaluate on the validation loader.

        Returns:
            tuple: ``(mean_loss, accuracy, predictions, labels)`` where the
            last two are flat lists collected over all validation batches.
        """
        self.model.eval()
        total_loss = 0
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                logits = self.model(input_ids, attention_mask)
                loss = self.criterion(logits, labels)

                total_loss += loss.item()

                # Predicted class = index of the largest logit
                predictions = torch.argmax(logits, dim=1)
                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        val_loss = total_loss / max(len(self.val_loader), 1)
        # With no validation samples there is nothing to score.
        val_accuracy = accuracy_score(all_labels, all_predictions) if all_labels else 0.0

        return val_loss, val_accuracy, all_predictions, all_labels

    def train(self, epochs=5):
        """Train for ``epochs`` epochs, checkpointing the best validation accuracy.

        The best-scoring weights are written to ``best_bert_model.pth``.

        Args:
            epochs: Number of epochs to run.

        Returns:
            tuple: ``(val_predictions, val_labels)`` from the final epoch
            that was run, or two empty lists when ``epochs`` is 0.
        """
        print(f"Training started - {epochs} epochs")
        print(f"Train batches: {len(self.train_loader)}")
        print(f"Validation batches: {len(self.val_loader)}")

        # The scheduler was sized in __init__; running past it means the
        # extra steps fall outside the planned warmup/decay schedule.
        if len(self.train_losses) + epochs > self.scheduled_epochs:
            print(
                f"Warning: LR schedule was sized for {self.scheduled_epochs} epochs; "
                f"pass epochs={len(self.train_losses) + epochs} to BERTTrainer to match."
            )

        best_val_accuracy = None
        # Defined up front so the return below is valid even when epochs == 0.
        val_predictions, val_labels = [], []

        for epoch in range(epochs):
            print(f"\n{'='*50}")
            print(f"Epoch {epoch + 1}/{epochs}")
            print(f"{'='*50}")

            # Training
            train_loss = self.train_epoch()
            self.train_losses.append(train_loss)

            # Validation
            val_loss, val_accuracy, val_predictions, val_labels = self.validate()
            self.val_losses.append(val_loss)
            self.val_accuracies.append(val_accuracy)

            # Print results
            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val Loss: {val_loss:.4f}")
            print(f"Val Accuracy: {val_accuracy:.4f}")

            # Save best model. The first epoch always checkpoints so the
            # file exists even if validation accuracy is 0.
            if best_val_accuracy is None or val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                torch.save(self.model.state_dict(), 'best_bert_model.pth')
                print(f"Best model saved! Accuracy: {best_val_accuracy:.4f}")

        reported_best = 0 if best_val_accuracy is None else best_val_accuracy
        print(f"\nTraining completed! Best validation accuracy: {reported_best:.4f}")
        return val_predictions, val_labels

def load_data():
    """Load the train/validation splits and, when readable, the test set.

    ``train_split.csv`` and ``val_split.csv`` are required and read from the
    current working directory; any error reading them propagates.
    ``customer_queries_test.csv`` is optional.

    Returns:
        tuple: ``(train_df, val_df, test_df)``; ``test_df`` is ``None`` when
        the test file is missing or cannot be read.
    """
    print("Loading data...")

    # Train data
    train_df = pd.read_csv('train_split.csv')
    print(f"Train data loaded: {len(train_df)} samples")

    # Validation data
    val_df = pd.read_csv('val_split.csv')
    print(f"Validation data loaded: {len(val_df)} samples")

    # Test data (if available). Only file/parse errors are treated as
    # "no test set"; KeyboardInterrupt and programming errors propagate.
    try:
        test_df = pd.read_csv('customer_queries_test.csv')
    except FileNotFoundError:
        test_df = None
        print("Test data not found")
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        test_df = None
        print(f"Test data could not be read: {exc}")
    else:
        print(f"Test data loaded: {len(test_df)} samples")

    return train_df, val_df, test_df

def create_data_loaders(train_df, val_df, tokenizer, batch_size=16, max_length=128,
                        num_workers=2):
    """Build the training and validation DataLoaders.

    Both frames must provide ``query_original`` (text) and ``label_encoded``
    (target) columns.

    Args:
        train_df: Training DataFrame.
        val_df: Validation DataFrame.
        tokenizer: Tokenizer handed to ``CustomerQueryDataset``.
        batch_size: Batch size for both loaders.
        max_length: Token length each query is padded/truncated to.
        num_workers: Worker processes per loader (default 2, as before);
            set to 0 to load in the main process.

    Returns:
        tuple: ``(train_loader, val_loader)``; only the training loader
        shuffles.
    """
    print("Creating DataLoaders...")

    # Datasets
    train_dataset = CustomerQueryDataset(
        train_df['query_original'].values,
        train_df['label_encoded'].values,
        tokenizer,
        max_length
    )

    val_dataset = CustomerQueryDataset(
        val_df['query_original'].values,
        val_df['label_encoded'].values,
        tokenizer,
        max_length
    )

    # DataLoaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )

    # Validation order is kept fixed so predictions line up across epochs.
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )

    return train_loader, val_loader

def plot_training_metrics(trainer):
    """Plot per-epoch loss and validation accuracy side by side.

    Reads ``train_losses``, ``val_losses`` and ``val_accuracies`` from
    ``trainer``, saves the figure to ``training_metrics.png`` and shows it.
    """
    _fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Epochs are numbered from 1 on the x-axis.
    epoch_axis = range(1, len(trainer.train_losses) + 1)

    # Left panel: train vs. validation loss.
    loss_ax.plot(epoch_axis, trainer.train_losses, 'b-', label='Train Loss')
    loss_ax.plot(epoch_axis, trainer.val_losses, 'r-', label='Validation Loss')

    # Right panel: validation accuracy.
    acc_ax.plot(epoch_axis, trainer.val_accuracies, 'g-', label='Validation Accuracy')

    # Shared decoration for both panels.
    panels = (
        (loss_ax, 'Model Loss', 'Loss'),
        (acc_ax, 'Model Accuracy', 'Accuracy'),
    )
    for axis, title, y_label in panels:
        axis.set_title(title)
        axis.set_xlabel('Epoch')
        axis.set_ylabel(y_label)
        axis.legend()
        axis.grid(True)

    plt.tight_layout()
    plt.savefig('training_metrics.png', dpi=300, bbox_inches='tight')
    plt.show()

def evaluate_model(predictions, true_labels, num_classes):
    """Print accuracy and a per-class report, and plot the confusion matrix.

    Args:
        predictions: Predicted class ids.
        true_labels: Ground-truth class ids.
        num_classes: Total number of classes; ids ``0 .. num_classes - 1``
            are reported even when some are absent from the data.

    Returns:
        dict: The ``classification_report`` output (``output_dict=True``).
        The confusion matrix is saved to ``confusion_matrix.png``.
    """
    print("\n" + "="*50)
    print("DETAILED MODEL EVALUATION")
    print("="*50)

    # Overall metrics
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Overall Accuracy: {accuracy:.4f}")

    # Pin the label set explicitly: without `labels`, the report raises when
    # the data contains fewer distinct classes than `target_names` entries.
    class_ids = list(range(num_classes))

    # Class-wise report
    print("\nClass-wise Performance:")
    class_report = classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=[f"Class_{i}" for i in class_ids],
        output_dict=True,
        zero_division=0  # classes with no samples/predictions score 0
    )

    # Show class performance as DataFrame
    class_df = pd.DataFrame(class_report).transpose()
    print(class_df.round(3))

    # Confusion Matrix, with rows/columns indexed by class id
    cm = confusion_matrix(true_labels, predictions, labels=class_ids)

    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    return class_report

def predict_single_query(model, tokenizer, query, device, max_length=128):
    """Classify one query string.

    Returns:
        tuple: ``(predicted_class, confidence, probabilities)`` where
        ``probabilities`` is a NumPy array of softmax scores for the query.
    """
    model.eval()

    # Tokenize with the same padding/truncation settings used for training.
    tokens = tokenizer(
        query,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    ids = tokens['input_ids'].to(device)
    mask = tokens['attention_mask'].to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        class_probs = torch.softmax(model(ids, mask), dim=1)

    # The batch holds a single query, so row 0 is the full distribution.
    top_class = torch.argmax(class_probs, dim=1).item()
    top_prob = class_probs[0][top_class].item()

    return top_class, top_prob, class_probs[0].cpu().numpy()

def main():
    """Run the full pipeline: load data, fine-tune, evaluate, and demo predictions.

    Writes ``best_bert_model.pth``, ``training_metrics.png`` and
    ``confusion_matrix.png`` to the current working directory.
    """
    print("BERT Customer Query Classifier")
    print("="*50)

    # Parameters
    MODEL_NAME = 'bert-base-uncased'  # or 'distilbert-base-uncased' for faster training
    MAX_LENGTH = 128
    BATCH_SIZE = 16
    EPOCHS = 5
    NUM_CLASSES = 30

    # 1. Load data
    train_df, val_df, test_df = load_data()

    # 2. Load tokenizer
    print(f"Loading BERT tokenizer: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # 3. Create DataLoaders
    train_loader, val_loader = create_data_loaders(
        train_df, val_df, tokenizer, BATCH_SIZE, MAX_LENGTH
    )

    # 4. Create model
    print(f"Creating BERT model: {MODEL_NAME}")
    model = BERTClassifier(MODEL_NAME, NUM_CLASSES)

    # 5. Create trainer
    trainer = BERTTrainer(model, train_loader, val_loader, NUM_CLASSES, device)

    # 6. Model training (returns the validation predictions/labels it collected)
    val_predictions, val_labels = trainer.train(epochs=EPOCHS)

    # 7. Visualize results
    plot_training_metrics(trainer)

    # 8. Detailed evaluation
    class_report = evaluate_model(val_predictions, val_labels, NUM_CLASSES)

    # 9. Sample predictions
    print("\n" + "="*50)
    print("SAMPLE PREDICTIONS")
    print("="*50)

    example_queries = [
        "I need help with my booking",
        "How can I change my flight?",
        "Payment was declined",
        "App is not working",
        "Where is my luggage?"
    ]

    # Restore the best checkpoint; map_location places the weights on the
    # active device even if the file was written from a different one.
    model.load_state_dict(torch.load('best_bert_model.pth', map_location=device))

    for query in example_queries:
        predicted_class, confidence, probs = predict_single_query(
            model, tokenizer, query, device
        )
        print(f"Query: '{query}'")
        print(f"Prediction: Class_{predicted_class} (Confidence: {confidence:.3f})")
        print("-" * 30)

    print("\nTraining and evaluation completed!")
    print("Saved files:")
    print("   - best_bert_model.pth (Best model)")
    print("   - training_metrics.png (Training plots)")
    print("   - confusion_matrix.png (Confusion matrix)")

if __name__ == "__main__":
    main()