In [2]:
# 04 - Traditional ML Baselines

## Overview
Comprehensive baseline modeling for evasion detection using:
1. TF-IDF + Logistic Regression (answer-only, question-only, combined)
2. TF-IDF + XGBoost with feature importance
3. Engineered linguistic features + models
4. Hybrid models (TF-IDF + engineered features)
5. Class imbalance handling strategies

## Experimental Setup
- Stratified 80/10/10 train/val/test splits (seed=42)
- MLflow experiment tracking
- Cross-validation on train+val
- Final test set evaluation

## Outputs
- Baseline performance comparison table
- Feature importance visualizations
- Best traditional baseline model saved
- Comprehensive summary of findings

SyntaxError: invalid syntax (649255565.py, line 4)

In [None]:
# ============================================================================
# SECTION 1: SETUP AND IMPORTS
# ============================================================================

import sys
import pathlib
import os
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
proj_root = pathlib.Path('..').resolve()
if str(proj_root) not in sys.path:
    sys.path.insert(0, str(proj_root))

# Core imports
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import json
import pickle
from datetime import datetime
from tqdm import tqdm

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import (
    classification_report, confusion_matrix, f1_score, 
    precision_score, recall_score, accuracy_score,
    roc_auc_score, matthews_corrcoef
)
from sklearn.base import BaseEstimator, TransformerMixin

# XGBoost
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    print("⚠️ XGBoost not available. Install with: pip install xgboost")
    XGBOOST_AVAILABLE = False

# Imbalanced-learn
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.pipeline import Pipeline as ImbPipeline
    IMBLEARN_AVAILABLE = True
except ImportError:
    print("⚠️ imbalanced-learn not available. Install with: pip install imbalanced-learn")
    IMBLEARN_AVAILABLE = False

# NLP libraries
import spacy
import nltk
from textblob import TextBlob
try:
    import textstat
    TEXTSTAT_AVAILABLE = True
except ImportError:
    print("⚠️ textstat not available. Install with: pip install textstat")
    TEXTSTAT_AVAILABLE = False

# MLflow
try:
    import mlflow
    import mlflow.sklearn
    MLFLOW_AVAILABLE = True
except ImportError:
    print("⚠️ MLflow not available. Install with: pip install mlflow")
    MLFLOW_AVAILABLE = False

# Project utilities
try:
    from src.utils import set_seed
except Exception as e:
    print(f'Warning: could not import src.utils.set_seed: {e}')

    def set_seed(seed=42):
        """Fallback seeding helper used when ``src.utils.set_seed`` cannot be imported.

        Seeds Python's built-in ``random`` module and NumPy's global random
        state so that code relying on either source is repeatable.

        Args:
            seed: Integer seed applied to both generators (default 42).
        """
        # Local import: `random` is not among this notebook's top-level imports.
        import random

        random.seed(seed)
        np.random.seed(seed)

# Set random seed
set_seed(42)

# Make sure the figure and model output folders exist (paths are relative to the working directory)
for _out_dir in ('notebooks/figures', 'models/baseline_traditional'):
    os.makedirs(_out_dir, exist_ok=True)

# ============================================================================
# SECTION 2: DATA LOADING AND TRAIN/VAL/TEST SPLIT
# ============================================================================

print("=" * 80, "SECTION 2: DATA LOADING AND SPLITTING", "=" * 80, sep="\n")

# Fetch the benchmark and flatten it into a single DataFrame
print("\nLoading dataset FutureMa/EvasionBench...")
ds = load_dataset("FutureMa/EvasionBench")
if isinstance(ds, dict):
    # A mapping of splits came back: keep only the first one
    ds = ds[next(iter(ds.keys()))]
df = ds.to_pandas()
print(f"✅ Dataset loaded: {df.shape[0]:,} samples")

# Summarise the columns and the class balance of the full dataset
print("\nDataset Overview:")
print(f"  Columns: {df.columns.tolist()}")
print(f"  Label distribution:")
label_counts = df['eva4b_label'].value_counts()
for label, count in label_counts.items():
    pct = count / len(df) * 100
    print(f"    {label}: {count:,} ({pct:.1f}%)")

# Build the stratified 80/10/10 partition in two stages
print("\nCreating stratified 80/10/10 train/val/test split...")

# Stage 1: keep 80% for training, hold out the remaining 20%
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df['eva4b_label'], random_state=42
)

# Stage 2: halve the held-out 20% into validation and test
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['eva4b_label'], random_state=42
)

print(f"\n✅ Split complete:")
print(f"  Train: {len(train_df):,} samples ({len(train_df)/len(df)*100:.1f}%)")
print(f"  Val:   {len(val_df):,} samples ({len(val_df)/len(df)*100:.1f}%)")
print(f"  Test:  {len(test_df):,} samples ({len(test_df)/len(df)*100:.1f}%)")

# Report the per-split class shares so the stratification can be checked by eye
print("\nLabel distribution verification:")
for split_name, split_df in zip(('Train', 'Val', 'Test'), (train_df, val_df, test_df)):
    print(f"\n{split_name}:")
    for label in ('direct', 'intermediate', 'fully_evasive'):
        count = (split_df['eva4b_label'] == label).sum()
        pct = count / len(split_df) * 100
        print(f"  {label}: {count:,} ({pct:.1f}%)")

# Map the three string labels to integer class ids
label_encoder = LabelEncoder().fit(['direct', 'intermediate', 'fully_evasive'])

y_train, y_val, y_test = (
    label_encoder.transform(part['eva4b_label']) for part in (train_df, val_df, test_df)
)

print(f"\n✅ Labels encoded: {label_encoder.classes_}")

# Raw text per split: answers and questions as NumPy arrays
X_train_answers, X_val_answers, X_test_answers = (
    part['answer'].values for part in (train_df, val_df, test_df)
)

X_train_questions, X_val_questions, X_test_questions = (
    part['question'].values for part in (train_df, val_df, test_df)
)

# Question and answer joined with a literal " [SEP] " marker (these stay pandas Series)
X_train_combined, X_val_combined, X_test_combined = (
    part['question'] + " [SEP] " + part['answer'] for part in (train_df, val_df, test_df)
)

print(f"\n✅ Text data prepared")
print(f"  Sample answer: {X_train_answers[0][:100]}...")
print(f"  Sample question: {X_train_questions[0][:100]}...")

# ============================================================================
# SECTION 3: FEATURE ENGINEERING FUNCTIONS
# ============================================================================

print("=" * 80, "SECTION 3: FEATURE ENGINEERING FUNCTIONS", "=" * 80, sep="\n")

# Load the small English spaCy pipeline without NER and parser;
# `nlp` is left as None when loading fails so callers can detect that.
try:
    nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
except Exception as e:
    print(f"⚠️ spaCy model not available: {e}")
    nlp = None
else:
    print("✅ spaCy model loaded")

# Hedging and certainty lexicons
# All entries are lower-case; they are looked up in lower-cased text by
# extract_hedging_features. Some entries are multi-word phrases.
HEDGE_WORDS = {
    'might', 'could', 'possibly', 'maybe', 'perhaps', 'potentially',
    'approximately', 'roughly', 'about', 'around', 'somewhat',
    'uncertain', 'unsure', 'not certain', 'not sure', 'hard to say',
    'depends', 'various', 'a bit', 'to some extent', 'relatively',
    'fairly', 'quite', 'rather', 'allegedly', 'apparently'
}

# 'undisputably' is kept for backward compatibility; the standard spelling
# 'indisputably' is listed as well so it is recognised too.
CERTAINTY_WORDS = {
    'definitely', 'certainly', 'exactly', 'precisely', 'absolutely',
    'clearly', 'undoubtedly', 'undisputably', 'indisputably', 'surely',
    'guarantee', 'confirm', 'assure', 'always', 'never', 'every', 'all'
}

# Phrases that signal the speaker is declining to commit to an answer.
DEFLECTION_PHRASES = {
    'not sure', 'hard to say', 'depends on', 'difficult to predict',
    "can't say", 'unclear at this time', 'remains to be seen',
    'too early to tell', 'premature to', 'would not want to speculate'
}

def extract_text_length_features(text):
    """Compute simple size statistics for ``text``.

    Returns a dict with the character count, the number of
    whitespace-separated tokens, the number of '.', '!' and '?' characters
    (used as the sentence count) and the mean token length (0 when the text
    has no tokens).
    """
    tokens = text.split()
    terminator_total = sum(text.count(mark) for mark in ('.', '!', '?'))

    if tokens:
        mean_token_length = np.mean([len(token) for token in tokens])
    else:
        mean_token_length = 0

    return {
        'char_count': len(text),
        'word_count': len(tokens),
        'sentence_count': terminator_total,
        'avg_word_length': mean_token_length,
    }

def extract_readability_features(text):
    """Compute readability scores for ``text`` using ``textstat``.

    Returns:
        dict with keys ``flesch_ease``, ``flesch_kincaid``, ``smog`` and
        ``ari``. All four values are 0 when textstat is not installed
        (``TEXTSTAT_AVAILABLE`` is false) or when any of the score
        computations raises an ordinary exception.
    """
    zero_scores = {'flesch_ease': 0, 'flesch_kincaid': 0, 'smog': 0, 'ari': 0}

    if not TEXTSTAT_AVAILABLE:
        return zero_scores

    try:
        return {
            'flesch_ease': textstat.flesch_reading_ease(text),
            'flesch_kincaid': textstat.flesch_kincaid_grade(text),
            'smog': textstat.smog_index(text),
            'ari': textstat.automated_readability_index(text),
        }
    except Exception:
        # Best-effort: a failure in any one score zeroes all four. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long extraction run can still be interrupted.
        return zero_scores

def extract_sentiment_features(text):
    """Return TextBlob's polarity and subjectivity scores for ``text`` as a dict."""
    sentiment = TextBlob(text).sentiment
    return dict(
        sentiment_polarity=sentiment.polarity,
        sentiment_subjectivity=sentiment.subjectivity,
    )

def _count_lexicon_hits(lowered_text, lexicon):
    """Count whole-word/phrase occurrences of any ``lexicon`` entry in ``lowered_text``."""
    if not lexicon:
        return 0
    # Longest entries first so a longer phrase is preferred over a shorter
    # entry starting at the same position; the secondary key keeps the
    # pattern deterministic for a set input.
    ordered_terms = sorted(lexicon, key=lambda term: (-len(term), term))
    alternatives = '|'.join(re.escape(term) for term in ordered_terms)
    # Look-arounds instead of \b so entries containing spaces or apostrophes
    # are still required to start and end on a word edge.
    pattern = r'(?<!\w)(?:' + alternatives + r')(?!\w)'
    return len(re.findall(pattern, lowered_text))


def extract_hedging_features(text):
    """Count hedging, certainty and deflection expressions in ``text``.

    Matching is case-insensitive and on whole words/phrases, so an entry such
    as 'all' is not triggered by 'really' or 'called'. Every occurrence is
    counted (a word repeated three times counts three times).

    Returns:
        dict with ``hedge_word_count``, ``certainty_word_count``,
        ``deflection_phrase_count`` and the hedge/certainty counts divided by
        the number of whitespace-separated tokens (0 for empty text).
    """
    lowered = text.lower()

    hedge_count = _count_lexicon_hits(lowered, HEDGE_WORDS)
    certainty_count = _count_lexicon_hits(lowered, CERTAINTY_WORDS)
    deflection_count = _count_lexicon_hits(lowered, DEFLECTION_PHRASES)

    word_count = len(text.split())

    return {
        'hedge_word_count': hedge_count,
        'certainty_word_count': certainty_count,
        'deflection_phrase_count': deflection_count,
        'hedge_ratio': hedge_count / word_count if word_count > 0 else 0,
        'certainty_ratio': certainty_count / word_count if word_count > 0 else 0,
    }

def extract_pos_features(text, nlp_model=nlp):
    """Return the share of tokens tagged ADJ, ADV, NOUN, VERB and PROPN.

    Each ratio is the tag's count divided by the number of tokens the
    pipeline produced for ``text``. All ratios are 0 when ``nlp_model`` is
    None or the document has no tokens.
    """
    tracked_tags = (
        ('adj_ratio', 'ADJ'),
        ('adv_ratio', 'ADV'),
        ('noun_ratio', 'NOUN'),
        ('verb_ratio', 'VERB'),
        ('propn_ratio', 'PROPN'),
    )

    if nlp_model is None:
        return {feature: 0 for feature, _ in tracked_tags}

    doc = nlp_model(text)
    tag_frequency = Counter(token.pos_ for token in doc)
    token_total = len(doc)

    if token_total > 0:
        return {
            feature: tag_frequency.get(tag, 0) / token_total
            for feature, tag in tracked_tags
        }
    return {feature: 0 for feature, _ in tracked_tags}

def _get_ner_pipeline():
    """Return a spaCy pipeline with NER enabled, loading it only on first use."""
    if getattr(_get_ner_pipeline, '_pipeline', None) is None:
        # Loading a spaCy model is expensive; keep one instance on the
        # function object instead of reloading it for every text.
        _get_ner_pipeline._pipeline = spacy.load("en_core_web_sm", disable=['parser'])
    return _get_ner_pipeline._pipeline


def extract_entity_features(text, nlp_model=nlp):
    """Count named entities found in ``text``.

    Args:
        text: Text to analyse.
        nlp_model: Only checked against None to decide whether spaCy is
            usable; entity recognition itself runs on a separate pipeline
            that has NER enabled.

    Returns:
        dict with counts of ORG, MONEY and PERCENT entities, ``number_count``
        (CARDINAL plus QUANTITY entities) and ``total_entities``. All values
        are 0 when ``nlp_model`` is None.
    """
    if nlp_model is None:
        return {
            'org_count': 0, 'money_count': 0, 'percent_count': 0,
            'number_count': 0, 'total_entities': 0
        }

    doc = _get_ner_pipeline()(text)

    entity_counts = Counter(ent.label_ for ent in doc.ents)

    return {
        'org_count': entity_counts.get('ORG', 0),
        'money_count': entity_counts.get('MONEY', 0),
        'percent_count': entity_counts.get('PERCENT', 0),
        'number_count': entity_counts.get('CARDINAL', 0) + entity_counts.get('QUANTITY', 0),
        'total_entities': len(doc.ents),
    }

def extract_all_features(question, answer, include_q_features=True):
    """Build the flat engineered-feature dict for one question/answer pair.

    Answer features are prefixed with ``answer_`` and (when
    ``include_q_features`` is true) question features with ``question_``.
    Two interaction features, ``length_ratio`` and ``sentiment_diff``, are
    always appended last.
    """
    features = {}

    # Answer side: length, readability, sentiment, hedging and, when a spaCy
    # pipeline is loaded, part-of-speech ratios.
    answer_extractors = [
        extract_text_length_features,
        extract_readability_features,
        extract_sentiment_features,
        extract_hedging_features,
    ]
    if nlp:
        answer_extractors.append(extract_pos_features)

    for extractor in answer_extractors:
        features.update({f'answer_{k}': v for k, v in extractor(answer).items()})

    # Question side (optional): length and sentiment only.
    if include_q_features:
        for extractor in (extract_text_length_features, extract_sentiment_features):
            features.update({f'question_{k}': v for k, v in extractor(question).items()})

    # Interaction features; missing inputs fall back to 1 (word counts) or 0 (polarity).
    answer_words = features.get('answer_word_count', 1)
    question_words = features.get('question_word_count', 1)
    features['length_ratio'] = answer_words / max(question_words, 1)

    answer_polarity = features.get('answer_sentiment_polarity', 0)
    question_polarity = features.get('question_sentiment_polarity', 0)
    features['sentiment_diff'] = abs(answer_polarity - question_polarity)

    return features

print("✅ Feature engineering functions defined")
print(f"   Expected feature count: ~30 features per Q-A pair")

# ============================================================================
# SECTION 4: TF-IDF BASELINES (LOGISTIC REGRESSION)
# ============================================================================

print("=" * 80, "SECTION 4: TF-IDF + LOGISTIC REGRESSION BASELINES", "=" * 80, sep="\n")

# Point MLflow at this notebook's experiment when the library is installed
if MLFLOW_AVAILABLE:
    mlflow.set_experiment("EvasionBench_Traditional_ML_Baselines")
    print("✅ MLflow experiment initialized")

# Each evaluated baseline appends its result dict to this list
results = list()

def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test,
                   model_name, feature_type):
    """Fit ``model`` on the training split and score it on validation and test.

    Returns a dict holding the two descriptive labels, accuracy / macro-F1 /
    weighted-F1 for both evaluation splits, the per-class F1 on the test
    split, the test predictions and the fitted model itself.
    """
    def _summary(y_true, y_pred):
        # Headline metrics shared by the validation and test reports
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'macro_f1': f1_score(y_true, y_pred, average='macro'),
            'weighted_f1': f1_score(y_true, y_pred, average='weighted'),
        }

    model.fit(X_train, y_train)

    val_predictions = model.predict(X_val)
    test_predictions = model.predict(X_test)

    report = {'model_name': model_name, 'feature_type': feature_type}
    report['val_metrics'] = _summary(y_val, val_predictions)
    report['test_metrics'] = _summary(y_test, test_predictions)
    report['per_class_f1'] = f1_score(y_test, test_predictions, average=None)
    report['y_test_pred'] = test_predictions
    report['model'] = model
    return report

# --- 4.1 Answer-only TF-IDF + Logistic Regression ---
print("\n" + "-" * 80, "4.1: Answer-only TF-IDF + Logistic Regression", "-" * 80, sep="\n")

# Vectorise the answers; the vocabulary is fitted on the training split only
tfidf_answer = TfidfVectorizer(
    max_features=10000, ngram_range=(1, 2), min_df=2, max_df=0.95, stop_words='english'
)

X_train_tfidf_answer = tfidf_answer.fit_transform(X_train_answers)
X_val_tfidf_answer, X_test_tfidf_answer = (
    tfidf_answer.transform(texts) for texts in (X_val_answers, X_test_answers)
)

print(f"TF-IDF shape: {X_train_tfidf_answer.shape}")

# Class-weighted logistic regression on the answer features
lr_answer = LogisticRegression(
    C=1.0, max_iter=1000, class_weight='balanced', random_state=42, n_jobs=-1
)

result_lr_answer = evaluate_model(
    lr_answer,
    X_train_tfidf_answer, y_train,
    X_val_tfidf_answer, y_val,
    X_test_tfidf_answer, y_test,
    "Logistic Regression", "TF-IDF (Answer-only)",
)
results.append(result_lr_answer)

# Report validation, test and per-class results
print(f"\nValidation Metrics:")
print(f"  Accuracy:  {result_lr_answer['val_metrics']['accuracy']:.4f}")
print(f"  Macro-F1:  {result_lr_answer['val_metrics']['macro_f1']:.4f}")
print(f"  Weighted-F1: {result_lr_answer['val_metrics']['weighted_f1']:.4f}")

print(f"\nTest Metrics:")
print(f"  Accuracy:  {result_lr_answer['test_metrics']['accuracy']:.4f}")
print(f"  Macro-F1:  {result_lr_answer['test_metrics']['macro_f1']:.4f}")
print(f"  Weighted-F1: {result_lr_answer['test_metrics']['weighted_f1']:.4f}")

print(f"\nPer-class F1 (Test):")
for i, label in enumerate(label_encoder.classes_):
    print(f"  {label}: {result_lr_answer['per_class_f1'][i]:.4f}")

# --- 4.2 Question-only TF-IDF + Logistic Regression ---
print("\n" + "-" * 80, "4.2: Question-only TF-IDF + Logistic Regression", "-" * 80, sep="\n")

# Vectorise the questions with a 5,000-term vocabulary cap
tfidf_question = TfidfVectorizer(
    max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.95, stop_words='english'
)

X_train_tfidf_question = tfidf_question.fit_transform(X_train_questions)
X_val_tfidf_question, X_test_tfidf_question = (
    tfidf_question.transform(texts) for texts in (X_val_questions, X_test_questions)
)

# Same classifier settings as the answer-only baseline
lr_question = LogisticRegression(
    C=1.0, max_iter=1000, class_weight='balanced', random_state=42, n_jobs=-1
)

result_lr_question = evaluate_model(
    lr_question,
    X_train_tfidf_question, y_train,
    X_val_tfidf_question, y_val,
    X_test_tfidf_question, y_test,
    "Logistic Regression", "TF-IDF (Question-only)",
)
results.append(result_lr_question)

print(f"\nTest Metrics:")
print(f"  Accuracy:  {result_lr_question['test_metrics']['accuracy']:.4f}")
print(f"  Macro-F1:  {result_lr_question['test_metrics']['macro_f1']:.4f}")
print(f"  Per-class F1: {result_lr_question['per_class_f1']}")

# --- 4.3 Combined Q+A TF-IDF + Logistic Regression ---
print("\n" + "-" * 80, "4.3: Combined Q+A TF-IDF + Logistic Regression", "-" * 80, sep="\n")

# Vectorise the joined question + answer text with a 15,000-term vocabulary cap
tfidf_combined = TfidfVectorizer(
    max_features=15000, ngram_range=(1, 2), min_df=2, max_df=0.95, stop_words='english'
)

X_train_tfidf_combined = tfidf_combined.fit_transform(X_train_combined)
X_val_tfidf_combined, X_test_tfidf_combined = (
    tfidf_combined.transform(texts) for texts in (X_val_combined, X_test_combined)
)

# Same classifier settings as the single-field baselines
lr_combined = LogisticRegression(
    C=1.0, max_iter=1000, class_weight='balanced', random_state=42, n_jobs=-1
)

result_lr_combined = evaluate_model(
    lr_combined,
    X_train_tfidf_combined, y_train,
    X_val_tfidf_combined, y_val,
    X_test_tfidf_combined, y_test,
    "Logistic Regression", "TF-IDF (Q+A Combined)",
)
results.append(result_lr_combined)

print(f"\nTest Metrics:")
print(f"  Accuracy:  {result_lr_combined['test_metrics']['accuracy']:.4f}")
print(f"  Macro-F1:  {result_lr_combined['test_metrics']['macro_f1']:.4f}")
print(f"  Per-class F1: {result_lr_combined['per_class_f1']}")

# --- 4.4 Hyperparameter Tuning for Best TF-IDF Config ---
print("\n" + "-" * 80)
print("4.4: Hyperparameter Tuning (TF-IDF + LR)")
print("-" * 80)

# Start below any attainable macro-F1 so the first configuration is always
# recorded, and initialise every "best_*" name that is read after the loop.
best_val_f1 = float('-inf')
best_config = None
best_model = None
best_tfidf = None

# Grid search over key hyperparameters
param_grid = {
    'C': [0.1, 0.5, 1.0, 2.0, 5.0],
    'max_features': [5000, 10000, 15000],
    'ngram_range': [(1, 1), (1, 2)]
}

# The TF-IDF matrices depend only on (max_features, ngram_range), not on C,
# so each vectorizer is fitted once and reused for every value of C.
_tfidf_cache = {}

print("Performing grid search...")
for C in param_grid['C']:
    for max_feat in param_grid['max_features']:
        for ngram in param_grid['ngram_range']:
            _cache_key = (max_feat, ngram)
            if _cache_key not in _tfidf_cache:
                # Create TF-IDF (vocabulary learned on the training split only)
                tfidf = TfidfVectorizer(
                    max_features=max_feat,
                    ngram_range=ngram,
                    min_df=2,
                    max_df=0.95,
                    stop_words='english'
                )
                X_train_tune = tfidf.fit_transform(X_train_combined)
                X_val_tune = tfidf.transform(X_val_combined)
                _tfidf_cache[_cache_key] = (tfidf, X_train_tune, X_val_tune)
            tfidf, X_train_tune, X_val_tune = _tfidf_cache[_cache_key]

            # Train LR
            lr = LogisticRegression(
                C=C,
                max_iter=1000,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )

            lr.fit(X_train_tune, y_train)
            y_val_pred = lr.predict(X_val_tune)
            val_f1 = f1_score(y_val, y_val_pred, average='macro')

            # Strict '>' keeps the earliest configuration on ties
            if val_f1 > best_val_f1:
                best_val_f1 = val_f1
                best_config = {'C': C, 'max_features': max_feat, 'ngram_range': ngram}
                best_model = lr
                best_tfidf = tfidf

# Release the cached matrices; best_tfidf keeps the winning vectorizer alive
del _tfidf_cache

print(f"\n✅ Best configuration found:")
print(f"  C: {best_config['C']}")
print(f"  max_features: {best_config['max_features']}")
print(f"  ngram_range: {best_config['ngram_range']}")
print(f"  Best val macro-F1: {best_val_f1:.4f}")

# Evaluate best model on test set (selection used the validation split only)
X_test_tune = best_tfidf.transform(X_test_combined)
y_test_pred_tuned = best_model.predict(X_test_tune)
test_f1_tuned = f1_score(y_test, y_test_pred_tuned, average='macro')

# Same layout as the dicts returned by evaluate_model, except that
# val_metrics carries only the macro-F1 used for model selection.
result_lr_tuned = {
    'model_name': 'Logistic Regression (Tuned)',
    'feature_type': 'TF-IDF (Q+A Combined)',
    'val_metrics': {'macro_f1': best_val_f1},
    'test_metrics': {
        'accuracy': accuracy_score(y_test, y_test_pred_tuned),
        'macro_f1': test_f1_tuned,
        'weighted_f1': f1_score(y_test, y_test_pred_tuned, average='weighted')
    },
    'per_class_f1': f1_score(y_test, y_test_pred_tuned, average=None),
    'y_test_pred': y_test_pred_tuned,
    'model': best_model
}

results.append(result_lr_tuned)

print(f"\nTest Metrics (Tuned):")
print(f"  Accuracy:  {result_lr_tuned['test_metrics']['accuracy']:.4f}")
print(f"  Macro-F1:  {result_lr_tuned['test_metrics']['macro_f1']:.4f}")
print(f"  Per-class F1: {result_lr_tuned['per_class_f1']}")

# ============================================================================
# SECTION 5: XGBOOST BASELINE
# ============================================================================

print("=" * 80)
print("SECTION 5: TF-IDF + XGBOOST BASELINE")
print("=" * 80)

if not XGBOOST_AVAILABLE:
    print("⚠️ Skipping XGBoost - library not available")
else:
    # --- 5.1 Basic XGBoost ---
    print("\n" + "-" * 80)
    print("5.1: XGBoost with TF-IDF (Combined Q+A)")
    print("-" * 80)
    
    # NOTE(review): `use_label_encoder` is reported as deprecated/unused by
    # newer XGBoost releases - confirm against the installed version.
    xgb_model = xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        eval_metric='mlogloss',
        use_label_encoder=False
    )
    
    # Reuses the combined Q+A TF-IDF matrices built in section 4.3
    result_xgb = evaluate_model(
        xgb_model, X_train_tfidf_combined, y_train,
        X_val_tfidf_combined, y_val, X_test_tfidf_combined, y_test,
        "XGBoost", "TF-IDF (Q+A Combined)"
    )
    
    results.append(result_xgb)
    
    print(f"\nTest Metrics:")
    print(f"  Accuracy:  {result_xgb['test_metrics']['accuracy']:.4f}")
    print(f"  Macro-F1:  {result_xgb['test_metrics']['macro_f1']:.4f}")
    print(f"  Per-class F1: {result_xgb['per_class_f1']}")
    
    # --- 5.2 Feature Importance Analysis ---
    print("\n" + "-" * 80)
    print("5.2: XGBoost Feature Importance")
    print("-" * 80)
    
    # Get feature importance ('weight' scores as reported by the booster)
    importance_dict = xgb_model.get_booster().get_score(importance_type='weight')
    
    # Map feature indices to names
    feature_names = tfidf_combined.get_feature_names_out()
    
    # Sort by importance; each booster key is stripped of its first character
    # and the remainder is parsed as a TF-IDF column index.
    sorted_importance = sorted(
        [(feature_names[int(k[1:])], v) for k, v in importance_dict.items()],
        key=lambda x: x[1],
        reverse=True
    )[:20]
    
    print("\nTop 20 most important features:")
    for i, (feature, importance) in enumerate(sorted_importance, 1):
        print(f"  {i:2d}. {feature}: {importance:.2f}")
    
    # Visualize feature importance (top 15 of the 20 listed above)
    fig, ax = plt.subplots(figsize=(10, 8))
    features, importances = zip(*sorted_importance[:15])
    ax.barh(range(len(features)), importances, align='center')
    ax.set_yticks(range(len(features)))
    ax.set_yticklabels(features)
    ax.invert_yaxis()
    ax.set_xlabel('Feature Importance (Weight)', fontsize=12)
    ax.set_title('XGBoost Top 15 Feature Importance', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('notebooks/figures/04_xgboost_feature_importance.png', dpi=150, bbox_inches='tight')
    plt.show()
    
    # --- 5.3 XGBoost Hyperparameter Tuning ---
    print("\n" + "-" * 80)
    print("5.3: XGBoost Hyperparameter Tuning")
    print("-" * 80)
    
    # Start below any attainable macro-F1 so the first configuration is
    # always recorded and the "best_*" names are never left as None.
    best_xgb_val_f1 = float('-inf')
    best_xgb_config = None
    best_xgb_model = None
    
    # The grid lists exactly the values that are searched (the simplified
    # search: 3 depths x 2 learning rates with a fixed number of trees), so
    # the declared grid and the loops below cannot drift apart.
    xgb_param_grid = {
        'max_depth': [4, 6, 8],
        'learning_rate': [0.05, 0.1],
        'n_estimators': [200]
    }
    
    print("Performing grid search (simplified)...")
    for max_depth in xgb_param_grid['max_depth']:
        # NOTE: `lr` is the learning rate here; it rebinds the name used for
        # the LogisticRegression instance in section 4.4.
        for lr in xgb_param_grid['learning_rate']:
            for n_estimators in xgb_param_grid['n_estimators']:
                xgb_tune = xgb.XGBClassifier(
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    learning_rate=lr,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    random_state=42,
                    n_jobs=-1,
                    eval_metric='mlogloss',
                    use_label_encoder=False
                )
                
                xgb_tune.fit(X_train_tfidf_combined, y_train)
                y_val_pred = xgb_tune.predict(X_val_tfidf_combined)
                val_f1 = f1_score(y_val, y_val_pred, average='macro')
                
                # Strict '>' keeps the earliest configuration on ties
                if val_f1 > best_xgb_val_f1:
                    best_xgb_val_f1 = val_f1
                    best_xgb_config = {
                        'max_depth': max_depth,
                        'learning_rate': lr,
                        'n_estimators': n_estimators
                    }
                    best_xgb_model = xgb_tune
    
    print(f"\n✅ Best XGBoost configuration:")
    print(f"  max_depth: {best_xgb_config['max_depth']}")
    print(f"  learning_rate: {best_xgb_config['learning_rate']}")
    print(f"  Best val macro-F1: {best_xgb_val_f1:.4f}")
    
    # Evaluate on test (selection used the validation split only)
    y_test_pred_xgb = best_xgb_model.predict(X_test_tfidf_combined)
    
    # Same layout as the dicts returned by evaluate_model, except that
    # val_metrics carries only the macro-F1 used for model selection.
    result_xgb_tuned = {
        'model_name': 'XGBoost (Tuned)',
        'feature_type': 'TF-IDF (Q+A Combined)',
        'val_metrics': {'macro_f1': best_xgb_val_f1},
        'test_metrics': {
            'accuracy': accuracy_score(y_test, y_test_pred_xgb),
            'macro_f1': f1_score(y_test, y_test_pred_xgb, average='macro'),
            'weighted_f1': f1_score(y_test, y_test_pred_xgb, average='weighted')
        },
        'per_class_f1': f1_score(y_test, y_test_pred_xgb, average=None),
        'y_test_pred': y_test_pred_xgb,
        'model': best_xgb_model
    }
    
    results.append(result_xgb_tuned)
    
    print(f"\nTest Metrics (Tuned XGBoost):")
    print(f"  Accuracy:  {result_xgb_tuned['test_metrics']['accuracy']:.4f}")
    print(f"  Macro-F1:  {result_xgb_tuned['test_metrics']['macro_f1']:.4f}")
    print(f"  Per-class F1: {result_xgb_tuned['per_class_f1']}")

# ============================================================================
# SECTION 6: ENGINEERED FEATURES EXTRACTION
# ============================================================================

print("=" * 80, "SECTION 6: ENGINEERED FEATURES EXTRACTION", "=" * 80, sep="\n")

# Warn the reader up front: the extraction below processes every row one by one
print("\nExtracting engineered features (this may take a few minutes)...")

def extract_features_batch(questions, answers, desc="Extracting"):
    """Run ``extract_all_features`` over paired questions and answers.

    Shows a tqdm progress bar labelled ``desc`` and returns a DataFrame with
    one row per pair and one column per engineered feature.
    """
    paired_texts = tqdm(zip(questions, answers), total=len(questions), desc=desc)
    feature_rows = [
        extract_all_features(question_text, answer_text, include_q_features=True)
        for question_text, answer_text in paired_texts
    ]
    return pd.DataFrame(feature_rows)

# Build the engineered-feature table for each split
print("\nExtracting train features...")
train_features_df = extract_features_batch(X_train_questions, X_train_answers, "Train")

print("\nExtracting val features...")
val_features_df = extract_features_batch(X_val_questions, X_val_answers, "Val")

print("\nExtracting test features...")
test_features_df = extract_features_batch(X_test_questions, X_test_answers, "Test")

print(f"\n✅ Feature extraction complete")
print(f"  Train features shape: {train_features_df.shape}")
print(f"  Val features shape: {val_features_df.shape}")
print(f"  Test features shape: {test_features_df.shape}")

# List the feature columns in table order
print(f"\nFeature names ({len(train_features_df.columns)} total):")
for i, col in enumerate(train_features_df.columns, 1):
    print(f"  {i:2d}. {col}")

# Replace missing feature values with 0 in every split
train_features_df, val_features_df, test_features_df = (
    frame.fillna(0) for frame in (train_features_df, val_features_df, test_features_df)
)

# Standardise: statistics are fitted on train only and then applied to val/test
scaler = StandardScaler()
X_train_eng = scaler.fit_transform(train_features_df)
X_val_eng, X_test_eng = (
    scaler.transform(frame) for frame in (val_features_df, test_features_df)
)
