# Model Development for Banking Customer Questions Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.model_selection import cross_val_score
import joblib
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' presumably also hides scikit-learn
# convergence / failed-fit warnings raised by the training and tuning cells
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. DATA LOADING & EXPLORATION

In [2]:
# Load the preprocessed train/test splits and the class mapping table
print("Loading preprocessed data...")
train_df = pd.read_csv('../data/processed/train_data.csv', encoding='utf-8')
test_df = pd.read_csv('../data/processed/test_data.csv', encoding='utf-8')
class_mapping = pd.read_csv('../data/processed/class_mapping.csv', encoding='utf-8')

print(f"Data loaded successfully!")
print(f"   Training set: {len(train_df)} questions")
print(f"   Test set: {len(test_df)} questions")
print(f"   Total classes: {train_df['department_target'].nunique()}")

# Extract features and targets.
# A question that was cleaned down to an empty string is written to CSV as an
# empty field and read back as NaN; TfidfVectorizer rejects NaN documents, so
# missing text is mapped back to '' (rows stay aligned with their targets).
X_train_text = train_df['question_cleaned'].fillna('')
y_train = train_df['department_target']
X_test_text = test_df['question_cleaned'].fillna('')
y_test = test_df['department_target']

# Display class distribution (value_counts is already sorted, largest first)
print(f"\nTraining set class distribution:")
class_dist = y_train.value_counts()
for i, (class_name, count) in enumerate(class_dist.items(), 1):
    percentage = (count / len(y_train)) * 100
    print(f"{i:2d}. {class_name[:35]:<35} {count:4d} ({percentage:5.1f}%)")

# Calculate class imbalance metrics: ratio of the largest to the smallest class
largest_class = class_dist.max()
smallest_class = class_dist.min()
imbalance_ratio = largest_class / smallest_class

print(f"\nClass imbalance analysis:")
print(f"   Largest class: {largest_class} samples ({(largest_class/len(y_train)*100):.1f}%)")
print(f"   Smallest class: {smallest_class} samples ({(smallest_class/len(y_train)*100):.1f}%)")
print(f"   Imbalance ratio: {imbalance_ratio:.1f}:1")
# Thresholds: ratio > 10 -> High, ratio > 5 -> Medium, otherwise Low
print(f"   Challenge level: {'High' if imbalance_ratio > 10 else 'Medium' if imbalance_ratio > 5 else 'Low'}")

Loading preprocessed data...
Data loaded successfully!
   Training set: 1565 questions
   Test set: 392 questions
   Total classes: 13

Training set class distribution:
 1. Пазар Ежедневно банкиране            458 ( 29.3%)
 2. Други                                254 ( 16.2%)
 3. Пазар Жилищни и ипотечни кредити     215 ( 13.7%)
 4. Пазар Потребителско кредитиране      115 (  7.3%)
 5. Пазар Разсрочени плащания             96 (  6.1%)
 6. Пазар Малък бизнес                    76 (  4.9%)
 7. ДИРЕКЦИЯ БАНКОВИ ОПЕРАЦИИ             66 (  4.2%)
 8. Пазар Спестяване и инвестиции         66 (  4.2%)
 9. Няколко пазара                        56 (  3.6%)
10. Пазар Банково застраховане            53 (  3.4%)
11. ДИРЕКЦИЯ ПАЗАР АТМ И КАСОВА ДЕЙНОСТ   49 (  3.1%)
12. Support_Services                      35 (  2.2%)
13. ДИРЕКЦИЯ КРЕДИТЕН РИСК ИНДИВИДУАЛНИ   26 (  1.7%)

Class imbalance analysis:
   Largest class: 458 samples (29.3%)
   Smallest class: 26 samples (1.7%)
   Imbalance ratio: 17.6:1


## 2. FEATURE ENGINEERING - TF-IDF VECTORIZATION

In [3]:
print("TF-IDF configuration for Bulgarian text:")

# All vectorizer settings in one place, unpacked into the constructor below.
# NOTE(review): strip_accents='unicode' presumably decomposes characters and
# drops combining marks, which would also fold Cyrillic 'й' into 'и' — confirm
# this is intended for Bulgarian text.
tfidf_settings = dict(
    max_features=5000,          # upper bound on vocabulary size
    ngram_range=(1, 2),         # single words plus two-word phrases
    min_df=2,                   # drop terms seen in fewer than 2 documents
    max_df=0.95,                # drop terms seen in more than 95% of documents
    strip_accents='unicode',    # accent folding (see note above)
    lowercase=True,             # case-insensitive vocabulary
    token_pattern=r'\b\w+\b',   # any run of word characters, including 1-char tokens
)
vectorizer = TfidfVectorizer(**tfidf_settings)

print(f"   Max features: {vectorizer.max_features}")
print(f"   N-gram range: {vectorizer.ngram_range}")
print(f"   Min document frequency: {vectorizer.min_df}")
print(f"   Max document frequency: {vectorizer.max_df}")

# Learn the vocabulary / IDF weights from the training text only, then apply
# the fitted vectorizer unchanged to the test text.
print(f"\nFitting TF-IDF vectorizer on training data...")
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Fraction of non-zero cells in the training matrix; sparsity is its complement
n_docs, n_terms = X_train_tfidf.shape
density = X_train_tfidf.nnz / (n_docs * n_terms)

print(f"TF-IDF vectorization completed!")
print(f"   Training matrix shape: {X_train_tfidf.shape}")
print(f"   Test matrix shape: {X_test_tfidf.shape}")
print(f"   Vocabulary size: {len(vectorizer.vocabulary_)}")
print(f"   Sparsity: {(1.0 - density)*100:.1f}%")

# Peek at the first 20 vocabulary entries (in feature-index order; this is a
# sample of the vocabulary, not a ranking by importance)
feature_names = vectorizer.get_feature_names_out()
print(f"\nSample vocabulary (first 20 features):")
print(f"   {list(feature_names[:20])}")

TF-IDF configuration for Bulgarian text:
   Max features: 5000
   N-gram range: (1, 2)
   Min document frequency: 2
   Max document frequency: 0.95

Fitting TF-IDF vectorizer on training data...
TF-IDF vectorization completed!
   Training matrix shape: (1565, 3198)
   Test matrix shape: (392, 3198)
   Vocabulary size: 3198
   Sparsity: 99.6%

Sample vocabulary (first 20 features):
   ['00', '000', '000 лв', '000 лева', '000евро', '000евро 10', '000лв', '000лв за', '01', '01 06', '03', '06', '1', '1 2', '10', '10 139', '10 лева', '100', '100 000', '1000']


## 3. BASELINE MODEL - LOGISTIC REGRESSION

In [4]:
print("Training baseline Logistic Regression model...")

# 'balanced' per-class weights, computed here only so they can be displayed;
# the dict is not passed to the model, which is configured with
# class_weight='balanced' directly.
unique_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=unique_classes, y=y_train)
class_weight_dict = dict(zip(unique_classes, class_weights))

print(f"Class weights for imbalance handling:")
heaviest_first = sorted(class_weight_dict.items(), key=lambda item: item[1], reverse=True)
for class_name, weight in heaviest_first[:5]:
    print(f"   {class_name[:30]:<30} weight: {weight:.2f}")
print(f"   ... (showing top 5)")

# Baseline classifier
baseline_settings = {
    'class_weight': 'balanced',
    'max_iter': 1000,
    'random_state': 42,
    'solver': 'liblinear',
}
baseline_model = LogisticRegression(**baseline_settings)

print(f"\nTraining baseline model...")
baseline_model.fit(X_train_tfidf, y_train)

# Predictions on both splits, then accuracy for each
y_train_pred_baseline = baseline_model.predict(X_train_tfidf)
y_test_pred_baseline = baseline_model.predict(X_test_tfidf)
train_accuracy_baseline = accuracy_score(y_train, y_train_pred_baseline)
test_accuracy_baseline = accuracy_score(y_test, y_test_pred_baseline)

# The train/test gap is reported as a rough overfitting indicator
accuracy_gap = abs(train_accuracy_baseline - test_accuracy_baseline)

print(f"Baseline model training completed!")
print(f"   Training accuracy: {train_accuracy_baseline:.4f}")
print(f"   Test accuracy: {test_accuracy_baseline:.4f}")
print(f"   Overfitting check: {accuracy_gap:.4f} difference")

# 5-fold cross-validation on the training matrix; the spread shown is 2 * std
cv_scores = cross_val_score(baseline_model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
print(f"   Cross-validation accuracy: {cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})")

Training baseline Logistic Regression model...
Class weights for imbalance handling:
   ДИРЕКЦИЯ КРЕДИТЕН РИСК ИНДИВИД weight: 4.63
   Support_Services               weight: 3.44
   ДИРЕКЦИЯ ПАЗАР АТМ И КАСОВА ДЕ weight: 2.46
   Пазар Банково застраховане     weight: 2.27
   Няколко пазара                 weight: 2.15
   ... (showing top 5)

Training baseline model...
Baseline model training completed!
   Training accuracy: 0.8843
   Test accuracy: 0.6888
   Overfitting check: 0.1956 difference
   Cross-validation accuracy: 0.6671 (±0.0260)


## 4. MODEL COMPARISON - MULTIPLE ALGORITHMS

In [5]:
print("Training multiple models for comparison...")

# Candidate estimators to compare. Every class used here (including
# ExtraTreesClassifier) is already imported in the notebook's import cell, so
# this cell needs no import of its own.
models = {
    'Logistic Regression': LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
        solver='liblinear'
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
        max_depth=10  # Prevent overfitting
    ),
    'Naive Bayes': MultinomialNB(
        alpha=1.0  # Smoothing parameter
    ),
    'SVM (Linear)': SVC(
        kernel='linear',
        class_weight='balanced',
        random_state=42,
        probability=True,  # Enable probability estimates
        C=1.0  # Regularization
    ),
    'Extra Trees': ExtraTreesClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
        max_depth=8
    ),
    'SVM (RBF)': SVC(
        kernel='rbf',
        class_weight='balanced',
        random_state=42,
        probability=True,
        C=1.0,
        gamma='scale'
    )
}

# Train and evaluate each model
model_results = {}   # model name -> metrics dict
trained_models = {}  # model name -> estimator fitted on the full training set

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    try:
        # Fit on the full training matrix (labels are passed as strings)
        model.fit(X_train_tfidf, y_train)

        # Make predictions
        y_train_pred = model.predict(X_train_tfidf)
        y_test_pred = model.predict(X_test_tfidf)

        # Calculate metrics
        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)

        # 5-fold cross-validation on the training data
        cv_scores = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')

        # Store results
        model_results[model_name] = {
            'train_accuracy': train_acc,
            'test_accuracy': test_acc,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'overfitting': abs(train_acc - test_acc)
        }

        # Store trained model
        trained_models[model_name] = model

        print(f"   {model_name}: Test accuracy = {test_acc:.4f}, CV = {cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})")

    except Exception as e:
        # Best effort: one failing candidate must not abort the whole comparison
        print(f"   {model_name} failed: {str(e)}")
        continue

# Without at least one successful model there is nothing to rank; fail with a
# clear message instead of an opaque error from max() on an empty sequence.
if not model_results:
    raise RuntimeError("All candidate models failed to train; nothing to compare or select.")

# Display comparison results
print(f"\nMODEL COMPARISON RESULTS:")
print(f"{'Model':<20} {'Train Acc':<10} {'Test Acc':<10} {'CV Mean':<10} {'CV Std':<10} {'Overfitting':<12}")
print("-" * 80)

for model_name, results in model_results.items():
    print(f"{model_name:<20} {results['train_accuracy']:<10.4f} {results['test_accuracy']:<10.4f} "
          f"{results['cv_mean']:<10.4f} {results['cv_std']:<10.4f} {results['overfitting']:<12.4f}")

# Select the best model by mean cross-validation accuracy only; test accuracy
# is printed for reference and does not take part in the selection.
best_model_name = max(model_results.keys(), key=lambda x: model_results[x]['cv_mean'])
best_model = trained_models[best_model_name]

print(f"\nBest performing model: {best_model_name}")
print(f"   • Cross-validation accuracy: {model_results[best_model_name]['cv_mean']:.4f}")
print(f"   • Test accuracy: {model_results[best_model_name]['test_accuracy']:.4f}")

Training multiple models for comparison...

Training Logistic Regression...
   Logistic Regression: Test accuracy = 0.6888, CV = 0.6671 (±0.0260)

Training Random Forest...
   Random Forest: Test accuracy = 0.5408, CV = 0.5361 (±0.0792)

Training Naive Bayes...
   Naive Bayes: Test accuracy = 0.5000, CV = 0.4690 (±0.0240)

Training SVM (Linear)...
   SVM (Linear): Test accuracy = 0.6837, CV = 0.6294 (±0.0379)

Training Extra Trees...
   Extra Trees: Test accuracy = 0.5459, CV = 0.5380 (±0.0405)

Training SVM (RBF)...
   SVM (RBF): Test accuracy = 0.6709, CV = 0.6070 (±0.0692)

MODEL COMPARISON RESULTS:
Model                Train Acc  Test Acc   CV Mean    CV Std     Overfitting 
--------------------------------------------------------------------------------
Logistic Regression  0.8843     0.6888     0.6671     0.0130     0.1956      
Random Forest        0.7125     0.5408     0.5361     0.0396     0.1716      
Naive Bayes          0.6115     0.5000     0.4690     0.0120     0.1115    

## 5. HYPERPARAMETER TUNING - TOP MODELS

In [6]:
# Search utilities and the fast linear SVM used in place of SVC(kernel='linear')
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import LinearSVC

# Select top 3 models (by mean CV accuracy) for hyperparameter tuning
top_models = sorted(model_results.items(), key=lambda x: x[1]['cv_mean'], reverse=True)[:3]
print(f"Tuning hyperparameters for top 3 models:")
for i, (name, results) in enumerate(top_models, 1):
    print(f"   {i}. {name}: {results['cv_mean']:.4f} CV accuracy")

# Store tuning results
tuned_models = {}    # model name -> best estimator found by the search
tuning_results = {}  # model name -> scores / parameters before and after tuning

for model_name, _ in top_models:
    print(f"\nHyperparameter tuning for {model_name}...")

    base_model = models[model_name]

    # Define parameter grids for different models
    if model_name == 'Logistic Regression':
        param_grid = {
            'C': [0.1, 0.5, 1.0, 2.0, 5.0],
            'solver': ['liblinear', 'saga'],
            'penalty': ['l1', 'l2'],
            'max_iter': [1000, 2000]
        }
        search_method = 'grid'

    elif model_name == 'SVM (Linear)':
        # LinearSVC only supports some penalty/loss/dual combinations. Listing
        # the valid ones explicitly avoids fitting candidates that fail on
        # every fold and end up silently scored as NaN.
        svc_c_values = [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
        param_grid = [
            # dual formulation: l2 penalty with either loss
            {'C': svc_c_values, 'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge'], 'dual': [True]},
            # primal formulation: squared hinge with either penalty
            {'C': svc_c_values, 'penalty': ['l1', 'l2'], 'loss': ['squared_hinge'], 'dual': [False]},
        ]
        # Use LinearSVC for faster tuning.
        # NOTE(review): this is a different estimator from the SVC compared
        # earlier, so the reported "improvement" mixes tuning with a model
        # change, and LinearSVC exposes no predict_proba — confirm downstream
        # code copes with a model that has no probability estimates.
        base_model = LinearSVC(class_weight='balanced', random_state=42, max_iter=2000)
        search_method = 'grid'

    elif model_name == 'SVM (RBF)':
        param_grid = {
            'C': [0.1, 1.0, 10.0],
            'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
            'kernel': ['rbf']
        }
        search_method = 'grid'

    elif 'Random Forest' in model_name or 'Extra Trees' in model_name:
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 15, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]
        }
        search_method = 'random'  # Too many combinations for grid

    elif 'Naive Bayes' in model_name:
        param_grid = {
            'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
            'fit_prior': [True, False]
        }
        search_method = 'grid'

    else:
        print(f"   No parameter grid defined for {model_name}, skipping...")
        continue

    # Perform hyperparameter search
    try:
        if search_method == 'grid':
            search = GridSearchCV(
                base_model,
                param_grid,
                cv=5,
                scoring='accuracy',
                n_jobs=-1,
                verbose=0
            )
        else:  # random search
            search = RandomizedSearchCV(
                base_model,
                param_grid,
                n_iter=20,  # Limited iterations for speed
                cv=5,
                scoring='accuracy',
                n_jobs=-1,
                random_state=42,
                verbose=0
            )

        # Fit the search
        search.fit(X_train_tfidf, y_train)

        # Get best model
        best_tuned_model = search.best_estimator_

        # Evaluate on test set (reported only; selection below uses the CV score)
        y_test_pred_tuned = best_tuned_model.predict(X_test_tfidf)
        test_acc_tuned = accuracy_score(y_test, y_test_pred_tuned)

        # Store results
        tuned_models[model_name] = best_tuned_model
        tuning_results[model_name] = {
            'best_params': search.best_params_,
            'best_cv_score': search.best_score_,
            'test_accuracy': test_acc_tuned,
            'original_cv': model_results[model_name]['cv_mean'],
            'original_test': model_results[model_name]['test_accuracy'],
            'cv_improvement': search.best_score_ - model_results[model_name]['cv_mean'],
            'test_improvement': test_acc_tuned - model_results[model_name]['test_accuracy']
        }

        print(f"   Tuning completed!")
        print(f"      Best CV score: {search.best_score_:.4f} (improvement: {tuning_results[model_name]['cv_improvement']:+.4f})")
        print(f"      Test accuracy: {test_acc_tuned:.4f} (improvement: {tuning_results[model_name]['test_improvement']:+.4f})")
        print(f"      Best parameters: {search.best_params_}")

    except Exception as e:
        # Best effort: a failed search for one model must not stop the others
        print(f"   Tuning failed for {model_name}: {str(e)}")
        continue

# Display tuning summary
if tuning_results:
    print(f"\nHYPERPARAMETER TUNING SUMMARY:")
    print(f"{'Model':<20} {'Original CV':<12} {'Tuned CV':<12} {'CV Improve':<12} {'Original Test':<14} {'Tuned Test':<12} {'Test Improve'}")
    print("-" * 105)

    for model_name, results in tuning_results.items():
        print(f"{model_name:<20} {results['original_cv']:<12.4f} {results['best_cv_score']:<12.4f} "
              f"{results['cv_improvement']:<12.4f} {results['original_test']:<14.4f} "
              f"{results['test_accuracy']:<12.4f} {results['test_improvement']:<12.4f}")

    # Find overall best model after tuning (highest tuned CV score)
    best_tuned_model_name = max(tuning_results.keys(), key=lambda x: tuning_results[x]['best_cv_score'])
    final_best_model = tuned_models[best_tuned_model_name]

    print(f"\nFINAL BEST MODEL AFTER TUNING: {best_tuned_model_name}")
    print(f"   Tuned CV accuracy: {tuning_results[best_tuned_model_name]['best_cv_score']:.4f}")
    print(f"   Tuned test accuracy: {tuning_results[best_tuned_model_name]['test_accuracy']:.4f}")
    print(f"   Best parameters: {tuning_results[best_tuned_model_name]['best_params']}")

    # Update best model for final evaluation
    best_model = final_best_model
    best_model_name = best_tuned_model_name

else:
    print(f"\nNo successful hyperparameter tuning completed.")

Tuning hyperparameters for top 3 models:
   1. Logistic Regression: 0.6671 CV accuracy
   2. SVM (Linear): 0.6294 CV accuracy
   3. SVM (RBF): 0.6070 CV accuracy

Hyperparameter tuning for Logistic Regression...
   Tuning completed!
      Best CV score: 0.6786 (improvement: +0.0115)
      Test accuracy: 0.6964 (improvement: +0.0077)
      Best parameters: {'C': 5.0, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}

Hyperparameter tuning for SVM (Linear)...
   Tuning completed!
      Best CV score: 0.6754 (improvement: +0.0460)
      Test accuracy: 0.6913 (improvement: +0.0077)
      Best parameters: {'C': 1.0, 'dual': True, 'loss': 'squared_hinge', 'penalty': 'l2'}

Hyperparameter tuning for SVM (RBF)...
   Tuning completed!
      Best CV score: 0.6415 (improvement: +0.0345)
      Test accuracy: 0.6913 (improvement: +0.0204)
      Best parameters: {'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'}

HYPERPARAMETER TUNING SUMMARY:
Model                Original CV  Tuned CV     CV Improv

## 6. MODEL SAVING

In [7]:
import json
import os

# Create models directory
os.makedirs('../models', exist_ok=True)

# Save the best model and vectorizer
print("Saving best model and vectorizer...")

# Save the vectorizer (needed to transform new inputs the same way as training)
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')
print(f"   Vectorizer saved: ../models/tfidf_vectorizer.pkl")

# Save the best trained model. The path is built once and reused: repeating
# the expression inside a double-quoted f-string needs nested double quotes,
# which is a SyntaxError on Python versions before 3.12.
model_slug = best_model_name.lower().replace(" ", "_")
best_model_path = f'../models/best_model_{model_slug}.pkl'
joblib.dump(best_model, best_model_path)
print(f"   Best model saved: {best_model_path}")

# Save class mapping for interpretability
class_mapping.to_csv('../models/class_mapping.csv', index=False, encoding='utf-8')
print(f"   Class mapping saved: ../models/class_mapping.csv")

# Scores and parameters come from the tuning run when one succeeded, otherwise
# from the untuned comparison results.
if tuning_results:
    tuned_summary = tuning_results[best_tuned_model_name]
    cv_accuracy = tuned_summary['best_cv_score']
    test_accuracy = tuned_summary['test_accuracy']
    best_parameters = tuned_summary['best_params']
else:
    cv_accuracy = model_results[best_model_name]['cv_mean']
    test_accuracy = model_results[best_model_name]['test_accuracy']
    best_parameters = 'default'

# Save model metadata
model_metadata = {
    'model_name': best_model_name,
    'model_type': type(best_model).__name__,
    'cv_accuracy': float(cv_accuracy),      # plain float for JSON
    'test_accuracy': float(test_accuracy),  # plain float for JSON
    'best_parameters': best_parameters,
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'training_samples': len(X_train_text),
    'test_samples': len(X_test_text),
    'num_classes': len(y_train.unique()),
    'vectorizer_params': {
        'max_features': vectorizer.max_features,
        'ngram_range': vectorizer.ngram_range,
        'min_df': vectorizer.min_df,
        'max_df': vectorizer.max_df
    }
}

# Save metadata as JSON (ensure_ascii=False keeps Cyrillic text readable)
with open('../models/model_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(model_metadata, f, indent=2, ensure_ascii=False)
print(f"   Model metadata saved: ../models/model_metadata.json")

print(f"\nModel package ready for deployment!")
print(f"   Model: {best_model_name}")
print(f"   Accuracy: {model_metadata['test_accuracy']:.4f}")
print(f"   Files saved in: ../models/")
print(f"Saved in: ../models/")
print(f"Test accuracy: {model_metadata['test_accuracy']:.4f}")
print(f"Best model: {best_model_name}")
print(f"\nModel training and evaluation completed successfully!")



Saving best model and vectorizer...
   Vectorizer saved: ../models/tfidf_vectorizer.pkl
   Best model saved: ../models/best_model_logistic_regression.pkl
   Class mapping saved: ../models/class_mapping.csv
   Model metadata saved: ../models/model_metadata.json

Model package ready for deployment!
   Model: Logistic Regression
   Accuracy: 0.6964
   Files saved in: ../models/
Saved in: ../models/
Test accuracy: 0.6964
Best model: Logistic Regression

Model training and evaluation completed successfully!


## 7. INTERACTIVE MODEL TESTING

In [8]:
def predict_question_department(question_text, show_probabilities=True,
                                model=None, text_vectorizer=None):
    """
    Predict the department for a single question.

    Args:
        question_text (str): Bulgarian question text.
        show_probabilities (bool): When True (default) and the model exposes
            ``predict_proba``, include the confidence and the top-3 classes
            in the result. When False, both are returned as None.
        model: Fitted classifier with ``predict`` (and optionally
            ``predict_proba`` / ``classes_``). Defaults to the notebook-level
            ``best_model``.
        text_vectorizer: Fitted vectorizer with ``transform``. Defaults to the
            notebook-level ``vectorizer``.

    Returns:
        dict: Keys ``original_text``, ``processed_text``,
        ``predicted_department``, ``confidence`` (highest class probability or
        None) and ``top_3_predictions`` (list of ``(class, probability)``
        pairs sorted by descending probability, or None).
    """
    import re
    import pandas as pd

    def clean_bulgarian_text(text):
        """Lowercase the text and normalize whitespace, repeated punctuation and quotes."""
        if pd.isna(text):
            return ""

        # Convert to string and lowercase
        text = str(text).lower()

        # Collapse whitespace runs, then trim both ends
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        # Collapse repeated '.', '!' and '?' to a single character
        text = re.sub(r'[.]{2,}', '.', text)
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)

        # Normalize quotes.
        # NOTE(review): these character classes look like they may have lost
        # their typographic quote characters; they are kept verbatim so the
        # cleaning stays aligned with the training-time preprocessing — confirm.
        text = re.sub(r'[""„"]', '"', text)
        text = re.sub(r'[''`]', "'", text)

        return text

    # Fall back to the notebook-level objects when none are passed explicitly
    classifier = best_model if model is None else model
    fitted_vectorizer = vectorizer if text_vectorizer is None else text_vectorizer

    # Apply the cleaning step, then vectorize the single document
    processed_text = clean_bulgarian_text(question_text)
    text_vector = fitted_vectorizer.transform([processed_text])

    # Predicted label for the single input row
    prediction = classifier.predict(text_vector)[0]

    # Probabilities are optional: requested by the caller AND supported by the model
    confidence = None
    sorted_probs = None
    if show_probabilities and hasattr(classifier, 'predict_proba'):
        probabilities = classifier.predict_proba(text_vector)[0]
        prob_dict = dict(zip(classifier.classes_, probabilities))

        # Sort by probability, highest first
        sorted_probs = sorted(prob_dict.items(), key=lambda x: x[1], reverse=True)
        confidence = max(probabilities)

    return {
        'original_text': question_text,
        'processed_text': processed_text,
        'predicted_department': prediction,
        'confidence': confidence,
        'top_3_predictions': sorted_probs[:3] if sorted_probs else None
    }

# Smoke-test the prediction helper on a handful of hand-written questions
print("Testing model with sample questions...")
print("\n" + "-"*50)

# Grouped by the kind of product the question is about
mortgage_questions = [
    "Искам да кандидатствам за ипотечен кредит за жилище",
    "Колко е първоначалната вноска за жилищен кредит?",
]
consumer_loan_questions = [
    "Колко е лихвата за потребителски кредит?",
    "Искам потребителски кредит за кола",
    "Каква е максималната сума за потребителски кредит?",
    "Какви са изискванията за потребителски кредит?",
]
business_questions = [
    "Искам бизнес сметка за моята фирма",
]
test_questions = mortgage_questions + consumer_loan_questions + business_questions

for i, question in enumerate(test_questions, start=1):
    print(f"\n{i}. Testing: '{question}'")
    result = predict_question_department(question)

    confidence_value = result['confidence']
    ranked_predictions = result['top_3_predictions']

    print(f"   Predicted department: {result['predicted_department']}")
    # Confidence and the ranking are only printed when the result provides them
    if confidence_value:
        print(f"   Confidence: {confidence_value:.3f}")

    if ranked_predictions:
        print(f"   Top 3 predictions:")
        for j, (dept, prob) in enumerate(ranked_predictions, start=1):
            print(f"      {j}. {dept}: {prob:.3f}")

    print(f"   Processed text: '{result['processed_text']}'")

Testing model with sample questions...

--------------------------------------------------

1. Testing: 'Искам да кандидатствам за ипотечен кредит за жилище'
   Predicted department: Пазар Жилищни и ипотечни кредити
   Confidence: 0.832
   Top 3 predictions:
      1. Пазар Жилищни и ипотечни кредити: 0.832
      2. ДИРЕКЦИЯ БАНКОВИ ОПЕРАЦИИ: 0.054
      3. Пазар Потребителско кредитиране: 0.022
   Processed text: 'искам да кандидатствам за ипотечен кредит за жилище'

2. Testing: 'Колко е първоначалната вноска за жилищен кредит?'
   Predicted department: Пазар Жилищни и ипотечни кредити
   Confidence: 0.802
   Top 3 predictions:
      1. Пазар Жилищни и ипотечни кредити: 0.802
      2. ДИРЕКЦИЯ БАНКОВИ ОПЕРАЦИИ: 0.045
      3. Пазар Потребителско кредитиране: 0.039
   Processed text: 'колко е първоначалната вноска за жилищен кредит?'

3. Testing: 'Колко е лихвата за потребителски кредит?'
   Predicted department: Пазар Потребителско кредитиране
   Confidence: 0.807
   Top 3 predictions:

## 8. INTERACTIVE MODEL TESTING v2 with Model Data

In [10]:
import os

import pandas as pd

# Load and prepare the raw labelled questions from the Excel workbook
print("Preparing test data...")
df = pd.read_excel('../data/raw/Коментари за сортиране.xlsx', header=None, skiprows=1)
df.columns = ['index', 'message_uid', 'created_dttm', 'question', 'department', 'col5', 'col6', 'market_segment']
# NOTE(review): this rebinds `test_df`, replacing the hold-out frame loaded in
# the data-loading cell. The workbook also presumably contains the rows the
# train/test CSVs were derived from, so the accuracy below would include
# training examples, and the raw `department` labels may not match the
# `department_target` labels the model was trained on — confirm.
test_df = df[['question', 'department']].dropna().rename(columns={'department': 'expected_department'})
print(f"Loaded {len(test_df)} questions")

# Run predictions
print("\n" + "="*50)
print("CSV PREDICTION TEST")
print("="*50)

correct = 0
results = []

for idx, row in test_df.iterrows():
    question = row['question']
    expected = row['expected_department']

    result = predict_question_department(question)
    predicted = result['predicted_department']
    confidence = result['confidence']

    is_correct = predicted == expected
    if is_correct:
        correct += 1

    # Confidence is None when the model has no probability estimates
    confidence_text = f"{confidence:.3f}" if confidence is not None else "n/a"
    print(f"{idx+1:4d}. {'OK' if is_correct else 'NOT OK'} {confidence_text} | {predicted}")
    if not is_correct:
        print(f"       Expected: {expected}")

    results.append({'question': question, 'expected': expected, 'predicted': predicted, 'confidence': confidence, 'correct': is_correct})

# Summary (guarded against an empty sheet and against missing confidences)
accuracy = correct / len(test_df) if len(test_df) else 0.0
known_confidences = [r['confidence'] for r in results if r['confidence'] is not None]
avg_conf = sum(known_confidences) / len(known_confidences) if known_confidences else None
avg_conf_text = f"{avg_conf:.3f}" if avg_conf is not None else "n/a"

print(f"\nSUMMARY:")
print(f"Accuracy: {correct}/{len(test_df)} ({accuracy*100:.1f}%)")
print(f"Avg Confidence: {avg_conf_text}")

# Export results to Excel
print("\nEXPORTING RESULTS TO EXCEL...")

# Create results DataFrame (explicit columns keep it well-formed even when empty)
results_df = pd.DataFrame(results, columns=['question', 'expected', 'predicted', 'confidence', 'correct'])

# Add status column for better readability
results_df['status'] = results_df['correct'].map({True: 'Correct', False: 'Wrong'})

# Reorder columns for better presentation
export_df = results_df[['question', 'expected', 'predicted', 'confidence', 'status', 'correct']]

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# Export to Excel with one sheet per view
excel_path = '../models/model_test_results.xlsx'
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    # Main results
    export_df.to_excel(writer, sheet_name='Test Results', index=False)

    # Summary statistics
    summary_data = {
        'Metric': ['Total Questions', 'Correct Predictions', 'Wrong Predictions', 'Accuracy %', 'Average Confidence'],
        'Value': [len(test_df), correct, len(test_df)-correct, f"{accuracy*100:.1f}%", avg_conf_text]
    }
    pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

    # Department accuracy breakdown, one row per expected department
    dept_stats = []
    for dept, dept_results in results_df.groupby('expected', sort=False):
        dept_correct = dept_results['correct'].sum()
        dept_total = len(dept_results)
        dept_stats.append({
            'Department': dept,
            'Total': dept_total,
            'Correct': dept_correct,
            'Accuracy %': dept_correct / dept_total * 100
        })

    # Sort on the numeric accuracy first and format as text afterwards;
    # sorting the formatted strings would rank "9.5%" above "66.7%".
    dept_df = pd.DataFrame(dept_stats, columns=['Department', 'Total', 'Correct', 'Accuracy %'])
    dept_df = dept_df.sort_values('Accuracy %', ascending=False, kind='stable')
    dept_df['Accuracy %'] = dept_df['Accuracy %'].map(lambda value: f"{value:.1f}%")
    dept_df.to_excel(writer, sheet_name='Department Analysis', index=False)

print(f"Results exported to: {excel_path}")
print(f"   Sheet 1: Test Results ({len(export_df)} rows)")
print(f"   Sheet 2: Summary Statistics")
print(f"   Sheet 3: Department Analysis")

Preparing test data...
Loaded 1962 questions

CSV PREDICTION TEST
   1. OK 0.683 | ДИРЕКЦИЯ ПАЗАР АТМ И КАСОВА ДЕЙНОСТ
   2. OK 0.533 | ДИРЕКЦИЯ БАНКОВИ ОПЕРАЦИИ
   3. NOT OK 0.315 | Пазар Жилищни и ипотечни кредити
       Expected: Пазар Ежедневно банкиране
   4. OK 0.561 | Пазар Ежедневно банкиране
   5. OK 0.735 | Пазар Ежедневно банкиране
   6. OK 0.714 | Пазар Потребителско кредитиране
   7. NOT OK 0.368 | Пазар Потребителско кредитиране
       Expected: Пазар Ежедневно банкиране
   8. OK 0.595 | Пазар Потребителско кредитиране
   9. OK 0.739 | Пазар Малък бизнес
  10. OK 0.597 | Други
  11. OK 0.778 | Пазар Ежедневно банкиране
  12. OK 0.576 | Няколко пазара
  13. OK 0.600 | Пазар Ежедневно банкиране
  14. OK 0.680 | Пазар Ежедневно банкиране
  15. OK 0.628 | Пазар Жилищни и ипотечни кредити
  16. OK 0.695 | Пазар Спестяване и инвестиции
  17. OK 0.830 | Пазар Ежедневно банкиране
  18. NOT OK 0.566 | Пазар Банково застраховане
       Expected: Пазар Ежедневно банкиране
  19. OK 0