# In [None]:
# ==============================================================================
# 1. CONFIGURATION & IMPORTS
# ==============================================================================
import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Locate the project root: the nearest ancestor of the working directory that
# contains a 'src' folder. This must run BEFORE the `src.*` imports below;
# otherwise those imports fail when the notebook is started from a
# sub-directory and the package is not installed.
current_dir = os.getcwd()
project_root = current_dir
while not os.path.exists(os.path.join(project_root, 'src')):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # Reached the filesystem root: fail loudly instead of looping forever.
        raise FileNotFoundError(
            f"Could not find a 'src' directory in {current_dir!r} or any parent directory."
        )
    project_root = parent_dir
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data.data_loader import ClinicalTrialLoader
from src.data.preprocessing import get_preprocessor
from src.models.train_test_split import temporal_train_test_split

# Optuna verbosity: only warnings and errors, no per-trial log lines
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load Data (Standard MVP Logic)
DATA_PATH = os.path.join(project_root, "data")
CSV_PATH = os.path.join(DATA_PATH, 'project_data.csv')

print("Loading Data...")
df = pd.read_csv(CSV_PATH)

# Temporal Split
X_train, X_test, y_train, y_test = temporal_train_test_split(df, train_ratio=0.8)

# Calculate Imbalance Ratio (negatives per positive), used as XGBoost's
# scale_pos_weight further down.
n_negative = int(np.sum(y_train == 0))
n_positive = int(np.sum(y_train == 1))
if n_positive == 0:
    # Without positives the ratio is undefined and no classifier can be trained.
    raise ValueError("Training split contains no positive samples (y_train == 1).")
ratio = n_negative / n_positive
print(f"Data Loaded. Train Shape: {X_train.shape}. Scale Pos Weight: {ratio:.2f}")

# ==============================================================================
# 2. FEATURE SELECTION: THE PURGE
# ==============================================================================
# We train a "Restrained" XGBoost model first.
# We reduce max_depth to 4 to force it to pick only strong features, not noise.

print("\n>>> Phase 1: Feature Selection (Permutation Importance)")

# 1. Preprocess Data separately so we can access column names.
# The preprocessor is fitted on the training split only and then applied to
# the test split.
preprocessor = get_preprocessor()
X_train_proc = preprocessor.fit_transform(X_train, y_train)
X_test_proc = preprocessor.transform(X_test)

# Get Feature Names
# Prefer the standard sklearn API; fall back to the project helper when the
# preprocessor cannot report its output names.
try:
    feature_names = preprocessor.get_feature_names_out()
except Exception as exc:
    # `Exception` rather than a bare `except:` so Ctrl-C / SystemExit still
    # propagate; the reason for the fallback is reported instead of hidden.
    print(f"get_feature_names_out() failed ({exc!r}); using get_feature_names fallback.")
    from src.data.preprocessing import get_feature_names
    feature_names = get_feature_names(preprocessor)

print(f"Total Features before selection: {len(feature_names)}")

# 2. Train Diagnostic Model
# Shallow trees (max_depth=4) so only dominant signals are picked up.
#
# Feature selection must not look at the test set, otherwise the final test
# metrics are optimistically biased. The diagnostic model is therefore fitted
# on one part of the training data and the permutation importance is scored
# on a validation part held out from that same training data.
from sklearn.model_selection import train_test_split

X_sel_fit, X_sel_val, y_sel_fit, y_sel_val = train_test_split(
    X_train_proc,
    y_train,
    test_size=0.2,
    stratify=y_train,  # keep the class balance comparable in both parts
    random_state=42,
)

selector_model = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    scale_pos_weight=ratio,
    random_state=42,
    n_jobs=-1
)
selector_model.fit(X_sel_fit, y_sel_fit)

# 3. Calculate Permutation Importance
# This shuffles each column and sees how much ROC_AUC drops
print("Computing Permutation Importance (this may take a minute)...")
result = permutation_importance(
    selector_model,
    X_sel_val,
    y_sel_val,
    n_repeats=5,
    random_state=42,
    scoring='roc_auc',
    n_jobs=-1
)

# 4. Identify Toxic Features
# Importance < 0 means the feature actually CONFUSED the model (removing it improves score)
# Importance == 0 means the feature does nothing
importances = result.importances_mean
if len(feature_names) != len(importances):
    # A mismatch means names and columns are misaligned, so any keep/drop
    # decision below would be attributed to the wrong feature.
    raise ValueError(
        f"Got {len(feature_names)} feature names for {len(importances)} importances."
    )

# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

print("\n--- TOP 10 FEATURES ---")
# Slicing (instead of range(10)) also works when fewer than 10 features exist.
for idx in indices[:10]:
    print(f"{feature_names[idx]}: {importances[idx]:.4f}")

print("\n--- TOXIC FEATURES (Negative Importance) ---")
keep_features = []
drop_features = []
for name, importance in zip(feature_names, importances):
    # Zero or negative importance: the feature does not help, so drop it.
    if importance <= 0:
        drop_features.append(name)
    else:
        keep_features.append(name)

print(f"Identified {len(drop_features)} features to drop.")
print(f"Remaining Features: {len(keep_features)}")

# ==============================================================================
# 3. HYPERPARAMETER OPTIMIZATION: OPTUNA
# ==============================================================================
print("\n>>> Phase 2: Hyperparameter Tuning (Bayesian Optimization)")

def objective(trial):
    """Optuna objective: mean cross-validated ROC-AUC of one XGBoost configuration.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC-AUC over the cross-validation folds of the training data
        (higher is better).
    """
    # Local import keeps this block self-contained.
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    # 1. Hyperparameter Space
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5), # Pruning parameter
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10), # L1 Reg
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10), # L2 Reg
        # Tuning the weight. The upper bound is clamped to the lower bound (1)
        # because Optuna rejects a range whose low exceeds its high, which
        # would happen whenever ratio < 2/3.
        'scale_pos_weight': trial.suggest_float(
            'scale_pos_weight', 1, max(1.0, ratio * 1.5)
        ),
        'booster': 'gbtree',
        'tree_method': 'auto',
        'n_jobs': -1,
        'random_state': 42
    }

    # 2. Build Model
    model = XGBClassifier(**param)

    # NOTE: In a real scenario, we would filter X_train_proc to 'keep_features' here
    # For this MVP step, we will run on full features to establish the optimization baseline
    # or you can implement the column slicing if you extract indices of keep_features.

    # 3. Evaluate
    # Score on folds of the TRAINING data only. Tuning against X_test_proc /
    # y_test would leak the test set into model selection and make the final
    # test metrics optimistic.
    # NOTE(review): if the training rows are time-ordered, a time-aware
    # splitter may be more appropriate than shuffled folds - confirm.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        model, X_train_proc, y_train, scoring='roc_auc', cv=cv
    )

    return float(fold_scores.mean())

# Run Optimization
# 20 trials for speed, increase to 50+ for production
N_TRIALS = 20

# Seed the sampler so the search is repeatable, matching the fixed
# random_state=42 used for every model in this script.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
print(f"Starting Optuna Study ({N_TRIALS} Trials)...")
study.optimize(objective, n_trials=N_TRIALS)

print("\n================================================================================")
print(" OPTIMIZATION RESULTS")
print("================================================================================")
print(f"Best ROC-AUC: {study.best_value:.4f}")
print("Best Hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# ==============================================================================
# 4. TRAIN & SAVE FINAL MODEL
# ==============================================================================
print("\n>>> Phase 3: Training Final Model")

# study.best_params holds only the values Optuna sampled, so the fixed
# settings (n_jobs, random_state) are supplied again here.
best_params = study.best_params
final_model = XGBClassifier(**best_params, n_jobs=-1, random_state=42)
final_model.fit(X_train_proc, y_train)

# Evaluation on the test split
final_preds = final_model.predict_proba(X_test_proc)[:, 1]
final_auc = roc_auc_score(y_test, final_preds)
final_pr = average_precision_score(y_test, final_preds)

print(f"Final Test ROC-AUC: {final_auc:.4f}")
print(f"Final Test PR-AUC:  {final_pr:.4f}")

# Save
# Note: We save the classifier separately or wrap it back in a pipeline if needed
# For now, saving just the classifier to demonstrate
# The path is anchored at project_root so saving works regardless of the
# directory the notebook was started from; the folder is created if missing.
model_dir = os.path.join(project_root, 'src', 'models')
os.makedirs(model_dir, exist_ok=True)
joblib.dump(final_model, os.path.join(model_dir, 'xgboost_optimized.joblib'))
print("Optimized model saved.")