In [None]:
# Setup: imports, repository path wiring, project helpers, seeding and output directory
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import time
import warnings

# NOTE(review): this silences every warning for the rest of the session
# (including deprecation warnings); narrow the filter if any of them matter.
warnings.filterwarnings('ignore')

# Put the directory three levels above the current working directory on
# sys.path so the project packages imported below can be resolved.
# Assumes the notebook is launched from its own folder -- TODO confirm.
repo_root = Path().resolve().parents[2]
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

from modules._import_helper import safe_import_from
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
import lightgbm as lgb

# safe_import_from(module_path, *names) is a project helper; presumably it
# exists because these package names start with digits and cannot be used in
# a regular `import` statement -- verify against modules/_import_helper.
set_seed = safe_import_from('00_repo_standards.src.mlphys_core.seeding', 'set_seed')
load_data, get_feature_columns, split_data = safe_import_from(
    '03_ml_tabular_foundations.src.data',
    'load_data', 'get_feature_columns', 'split_data'
)
create_lightgbm_pipeline = safe_import_from(
    '03_ml_tabular_foundations.src.models',
    'create_lightgbm_pipeline'
)

set_seed(42)

# All figures, tables and logs produced by this notebook are written here.
# parents=True keeps the cell working even if intermediate folders are missing.
reports_dir = Path("../reports")
reports_dir.mkdir(parents=True, exist_ok=True)

print("✅ Setup complete")

## 1. Why Gradient Boosting?

**Gradient Boosting Machines (GBM)**: Ensemble of decision trees trained sequentially.

**Advantages**:
- **Handles feature interactions** (non-linear relationships)
- **Mixed data types** (continuous, categorical, missing values)
- **Feature importance** (built-in explainability)
- **State-of-the-art tabular performance** (frequently used in winning Kaggle solutions on tabular data)

**LightGBM specifics**:
- Fast training (histogram-based)
- Memory efficient
- Handles large datasets (millions of rows)
- Native categorical feature support

In [None]:
# Read the dataset, separate features from the `is_signal` target, and carve
# out train / validation / test partitions with a fixed seed.
df = load_data()
feature_cols = get_feature_columns(df)
X, y = df[feature_cols].values, df['is_signal'].values

split_kwargs = dict(test_size=0.2, val_size=0.2, random_state=42)
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y, **split_kwargs)

# Report overall dimensions followed by the size of each partition
n_rows, n_cols = X.shape
print(f"Dataset: {n_rows:,} samples, {n_cols} features")

split_sizes = {"Train": len(y_train), "Val": len(y_val), "Test": len(y_test)}
print("Split sizes: " + ", ".join(f"{label}={count:,}" for label, count in split_sizes.items()))

## 2. Key LightGBM Hyperparameters

**Understanding hyperparameters is critical** for effective tuning.

**Major categories**:

1. **Tree Structure**:
   - `num_leaves`: Complexity (more leaves = more complex)
   - `max_depth`: Tree depth limit (prevent overfitting)
   - `min_child_samples`: Minimum samples per leaf (regularization)

2. **Boosting**:
   - `n_estimators`: Number of trees (more = better fit, but slower)
   - `learning_rate`: Shrinkage (lower = more robust, but need more trees)

3. **Regularization**:
   - `reg_alpha` (L1): Feature selection
   - `reg_lambda` (L2): Smooth weights

4. **Sampling**:
   - `subsample`: Row sampling (< 1.0 adds randomness; LightGBM only applies it when `subsample_freq` > 0)
   - `colsample_bytree`: Column sampling (feature subset)

## 3. Baseline GBM: Default Parameters

Start with sensible defaults, measure performance.

In [None]:
# Train the baseline GBM with default-style parameters and score it on all three splits
print("Training Baseline GBM (default parameters)...")
print("=" * 60)

# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments the way time.time() can. As before, the timing
# covers pipeline construction plus fitting.
start_time = time.perf_counter()

# Create pipeline with default parameters
pipeline_default = create_lightgbm_pipeline(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=-1,  # No limit
    num_leaves=31,
    random_state=42
)

pipeline_default.fit(X_train, y_train)
train_time = time.perf_counter() - start_time

# Evaluate: probability of the positive class (column 1) on each split
y_pred_train = pipeline_default.predict_proba(X_train)[:, 1]
y_pred_val = pipeline_default.predict_proba(X_val)[:, 1]
y_pred_test = pipeline_default.predict_proba(X_test)[:, 1]

# Later cells read these keys for the comparison table and the experiment log
metrics_default = {
    'train_auc': roc_auc_score(y_train, y_pred_train),
    'val_auc': roc_auc_score(y_val, y_pred_val),
    'test_auc': roc_auc_score(y_test, y_pred_test),
    'train_pr_auc': average_precision_score(y_train, y_pred_train),
    'val_pr_auc': average_precision_score(y_val, y_pred_val),
    'test_pr_auc': average_precision_score(y_test, y_pred_test),
    'train_time': train_time
}

print(f"Training time: {train_time:.2f}s")
print(f"\nPerformance:")
print(f"  Train AUC: {metrics_default['train_auc']:.4f}")
print(f"  Val AUC:   {metrics_default['val_auc']:.4f}")
print(f"  Test AUC:  {metrics_default['test_auc']:.4f}")
print(f"\n  Train PR-AUC: {metrics_default['train_pr_auc']:.4f}")
print(f"  Val PR-AUC:   {metrics_default['val_pr_auc']:.4f}")
print(f"  Test PR-AUC:  {metrics_default['test_pr_auc']:.4f}")

# Train-minus-validation AUC is used as a simple overfitting indicator;
# the 0.05 cut-off is a rule of thumb, not a statistical test.
OVERFIT_GAP_THRESHOLD = 0.05
overfitting_gap = metrics_default['train_auc'] - metrics_default['val_auc']
print(f"\n📊 Overfitting gap: {overfitting_gap:.4f}")
if overfitting_gap > OVERFIT_GAP_THRESHOLD:
    print("  ⚠️ Significant overfitting → need regularization")
else:
    print("  ✅ Reasonable generalization")

## 4. Learning Curves: Diagnosing Over/Underfitting

**Learning curve**: Performance vs. training set size.

**Interpretation**:
- **High train, low val**: Overfitting (model too complex)
- **Low train, low val**: Underfitting (model too simple)
- **Converging**: Good fit (sweet spot)

In [None]:
# Generate a learning curve: refit the same model on growing prefixes of the
# training set and track train vs. validation AUC.
train_sizes = np.linspace(0.1, 1.0, 10)
# Compute the subset sizes once so the fits and the x-axis use identical counts
train_samples = (train_sizes * len(X_train)).astype(int)
train_scores = []
val_scores = []

print("Generating learning curve...")
for n_samples in train_samples:
    # Prefix slicing assumes X_train is already shuffled by split_data -- TODO confirm
    X_subset = X_train[:n_samples]
    y_subset = y_train[:n_samples]

    # Train model
    pipeline_lc = create_lightgbm_pipeline(n_estimators=100, learning_rate=0.1, random_state=42)
    pipeline_lc.fit(X_subset, y_subset)

    # Evaluate on the data it was fit on and on the fixed validation set
    train_pred = pipeline_lc.predict_proba(X_subset)[:, 1]
    val_pred = pipeline_lc.predict_proba(X_val)[:, 1]

    train_scores.append(roc_auc_score(y_subset, train_pred))
    val_scores.append(roc_auc_score(y_val, val_pred))

# Plot learning curve
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(train_samples, train_scores, 'o-', linewidth=2.5, label='Train', color='steelblue', markersize=8)
ax.plot(train_samples, val_scores, 's-', linewidth=2.5, label='Validation', color='coral', markersize=8)
ax.set_xlabel('Training Set Size', fontsize=12)
ax.set_ylabel('AUC-ROC', fontsize=12)
ax.set_title('Learning Curve: LightGBM', fontsize=13, fontweight='bold')
ax.legend(fontsize=11, loc='lower right')
ax.grid(alpha=0.3)
# Keep the usual 0.85-1.0 window, but widen it when a score would otherwise
# fall below the axis and silently disappear from the figure.
y_lower = min(0.85, min(train_scores + val_scores) - 0.01)
ax.set_ylim([y_lower, 1.0])

plt.tight_layout()
plt.savefig(reports_dir / '04_learning_curve.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"📊 Learning Curve Analysis:")
print(f"  • Final train AUC: {train_scores[-1]:.4f}")
print(f"  • Final val AUC: {val_scores[-1]:.4f}")
print(f"  • Gap: {train_scores[-1] - val_scores[-1]:.4f}")

# A plateau means the last increment of data bought (almost) nothing. Testing
# only for a strict decrease would label a perfectly flat curve as "still
# improving", so compare the last gain against a small tolerance instead.
PLATEAU_TOL = 1e-3
last_gain = val_scores[-1] - val_scores[-2]
if last_gain < PLATEAU_TOL:
    print("  • Val performance plateaued → diminishing returns from more data")
else:
    print("  • Val performance still improving → more data could help")

## 5. Feature Importance Analysis

**Built-in importance**: LightGBM tracks how much each feature reduces loss.

**Interpretation**:
- **Gain**: Total reduction in loss contributed by all splits that use the feature
- **Split**: Number of times the feature is used in trees (the default for LightGBM's `feature_importances_`)

**Use case**: Feature selection, domain validation, explainability.

In [None]:
# Extract feature importance from the fitted LightGBM step of the baseline pipeline
lgb_model = pipeline_default.named_steps['classifier']
feature_importance = lgb_model.feature_importances_
feature_names = [f"F{i}_{name}" for i, name in enumerate(feature_cols)]

# feature_importances_ follows the estimator's `importance_type`. LightGBM's
# scikit-learn wrapper defaults to 'split' (usage counts), not 'gain', so read
# the setting rather than hard-coding "Gain" in the axis label.
importance_type = getattr(lgb_model, 'importance_type', 'split')

# Sort by importance
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
}).sort_values('Importance', ascending=False)

# Plot top features; clamp to the number available so barh() never receives
# more y positions than bar widths on datasets with fewer than 10 features.
fig, ax = plt.subplots(figsize=(10, 7))
top_n = min(10, len(importance_df))
top_features = importance_df.head(top_n)

ax.barh(range(top_n), top_features['Importance'].values, alpha=0.8,
        color='steelblue', edgecolor='black', linewidth=1.5)
ax.set_yticks(range(top_n))
ax.set_yticklabels(top_features['Feature'].values, fontsize=10)
ax.set_xlabel(f'Feature Importance ({str(importance_type).capitalize()})', fontsize=12)
ax.set_title(f'Top {top_n} Most Important Features', fontsize=13, fontweight='bold')
ax.grid(alpha=0.3, axis='x')
ax.invert_yaxis()  # most important feature at the top

plt.tight_layout()
plt.savefig(reports_dir / '04_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"📊 Feature Importance (Top 5):")
for idx, row in importance_df.head(5).iterrows():
    print(f"  {row['Feature']:<25} {row['Importance']:.4f}")

print(f"\n💡 Interpretation:")
print(f"  • Top feature: {importance_df.iloc[0]['Feature']}")
print(f"  • Top 5 features account for {importance_df.head(5)['Importance'].sum() / importance_df['Importance'].sum() * 100:.1f}% of total importance")

## 6. Hyperparameter Tuning: Randomized Search

**Strategy**: Random search over parameter distributions.

**Why random over grid**:
- More efficient (explores wider range)
- Better for high-dimensional spaces
- Anytime algorithm (can stop early)

**Tuning budget**: Limited to 20 iterations (< 3 min runtime).

In [None]:
# Search budget, defined once so the printed summary cannot drift from the
# values actually passed to RandomizedSearchCV.
N_ITER = 20
CV_FOLDS = 3

# Define search space (every entry is a discrete list of candidate values)
param_distributions = {
    'classifier__n_estimators': [100, 200, 300, 500],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__max_depth': [5, 7, 10, -1],
    'classifier__num_leaves': [15, 31, 63, 127],
    'classifier__min_child_samples': [10, 20, 50],
    'classifier__subsample': [0.6, 0.8, 1.0],
    # LightGBM ignores `subsample` unless bagging is enabled with
    # subsample_freq > 0; without this entry the subsample values above
    # would all train identical models.
    'classifier__subsample_freq': [1],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__reg_alpha': [0.0, 0.01, 0.1],
    'classifier__reg_lambda': [0.0, 0.01, 0.1]
}

print("Hyperparameter Tuning: Randomized Search")
print("=" * 60)
print(f"Search space: {np.prod([len(v) for v in param_distributions.values()]):,} combinations")
print(f"Tuning budget: {N_ITER} iterations × {CV_FOLDS}-fold CV = {N_ITER * CV_FOLDS} fits")
print(f"Estimated time: ~2 minutes")
print("\nSearching...")

# Create base pipeline
base_pipeline = create_lightgbm_pipeline(random_state=42)

# Randomized search with 3-fold CV (faster than 5-fold). Only the training
# split is used, so validation and test data stay untouched by tuning.
search = RandomizedSearchCV(
    base_pipeline,
    param_distributions,
    n_iter=N_ITER,
    cv=StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=0
)

start_time = time.time()
search.fit(X_train, y_train)
search_time = time.time() - start_time

print(f"\n✅ Search complete in {search_time:.1f}s")
print(f"\nBest parameters:")
for param, value in search.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nBest CV score: {search.best_score_:.4f}")

## 7. Evaluate Tuned Model

Compare tuned model against baseline.

In [None]:
# Evaluate the tuned model (the best configuration, refit by the search on the
# full training split) and compare it with the baseline.
best_pipeline = search.best_estimator_

y_pred_train_tuned = best_pipeline.predict_proba(X_train)[:, 1]
y_pred_val_tuned = best_pipeline.predict_proba(X_val)[:, 1]
y_pred_test_tuned = best_pipeline.predict_proba(X_test)[:, 1]

metrics_tuned = {
    'train_auc': roc_auc_score(y_train, y_pred_train_tuned),
    'val_auc': roc_auc_score(y_val, y_pred_val_tuned),
    'test_auc': roc_auc_score(y_test, y_pred_test_tuned),
    'train_pr_auc': average_precision_score(y_train, y_pred_train_tuned),
    'val_pr_auc': average_precision_score(y_val, y_pred_val_tuned),
    'test_pr_auc': average_precision_score(y_test, y_pred_test_tuned)
}

# Comparison table. "Train Time" must compare like with like: one fit of the
# baseline vs. one fit of the tuned model (refit_time_), not the wall-clock
# time of the whole search, which covers every candidate and CV fold.
comparison = pd.DataFrame({
    'Model': ['Default GBM', 'Tuned GBM'],
    'Train AUC': [metrics_default['train_auc'], metrics_tuned['train_auc']],
    'Val AUC': [metrics_default['val_auc'], metrics_tuned['val_auc']],
    'Test AUC': [metrics_default['test_auc'], metrics_tuned['test_auc']],
    'PR-AUC': [metrics_default['test_pr_auc'], metrics_tuned['test_pr_auc']],
    'Train Time': [f"{metrics_default['train_time']:.2f}s", f"{search.refit_time_:.2f}s"]
})

print("\n📊 Model Comparison:")
print("=" * 60)
print(comparison.to_string(index=False))
print(f"\nTotal search time (all candidates and folds): {search_time:.1f}s")

# Improvement is expressed in AUC percentage points on the test split
improvement = (metrics_tuned['test_auc'] - metrics_default['test_auc']) * 100
print(f"\n💡 Tuning improvement: {improvement:+.2f} percentage points AUC")

if improvement > 1.0:
    print("  ✅ Significant improvement from tuning")
elif improvement > 0:
    print("  ✅ Modest improvement from tuning")
else:
    print("  ⚠️ Tuning didn't help (default was already good)")

# Save comparison
comparison.to_csv(reports_dir / '04_tuning_comparison.csv', index=False)
print(f"\n✅ Comparison saved to: {reports_dir / '04_tuning_comparison.csv'}")

## 8. Hyperparameter Sensitivity Analysis

**Goal**: Understand which hyperparameters matter most.

**Method**: Plot CV score vs. each hyperparameter value.

In [None]:
# Extract search results (one row per sampled configuration)
results_df = pd.DataFrame(search.cv_results_)

# Hyperparameters to inspect. This is a fixed selection, not a ranking
# derived from the search results.
important_params = ['classifier__n_estimators', 'classifier__learning_rate', 'classifier__max_depth']

fig, axes = plt.subplots(1, len(important_params), figsize=(15, 5))
axes = np.atleast_1d(axes)

for idx, param in enumerate(important_params):
    ax = axes[idx]
    short_name = param.replace('classifier__', '')

    # cv_results_ may store parameter values as objects; coerce to numbers
    param_values = pd.to_numeric(results_df[f'param_{param}'], errors='coerce')
    scores = results_df['mean_test_score']

    # Plot each candidate value at an evenly spaced position. For max_depth the
    # sentinel -1 means "no limit"; on a numeric axis it would sit left of the
    # shallowest tree, so it is ordered last and labelled explicitly.
    is_depth = short_name == 'max_depth'
    levels = sorted(param_values.dropna().unique(),
                    key=lambda v: (bool(is_depth and v < 0), v))
    positions = param_values.map({level: pos for pos, level in enumerate(levels)})
    tick_labels = ['no limit' if (is_depth and level < 0) else f'{level:g}' for level in levels]

    # Scatter plot
    ax.scatter(positions, scores, alpha=0.6, s=100, color='steelblue', edgecolor='black', linewidth=1)
    ax.set_xticks(range(len(levels)))
    ax.set_xticklabels(tick_labels)
    ax.set_xlabel(short_name, fontsize=11)
    ax.set_ylabel('CV AUC-ROC' if idx == 0 else '', fontsize=11)
    ax.set_title(short_name.replace('_', ' ').title(), fontsize=12, fontweight='bold')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / '04_hyperparameter_sensitivity.png', dpi=150, bbox_inches='tight')
plt.show()

# General rules of thumb; these lines are not computed from the results above
print("📊 Hyperparameter Sensitivity:")
print("  • n_estimators: More trees → better performance (diminishing returns)")
print("  • learning_rate: Lower rate needs more trees (trade-off)")
print("  • max_depth: Deeper trees → more complex (risk overfitting)")

## 9. Experiment Tracking Best Practices

**Professional workflow**:
1. Log all hyperparameters
2. Track metrics (train/val/test)
3. Save best model
4. Version control configs

In [None]:
# Save experiment config and results as a JSON log next to the other reports
import json


def _json_default(value):
    """Convert NumPy scalars to built-in Python types so json can serialize them.

    Raises:
        TypeError: for any other unsupported type, matching json's own behaviour.
    """
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


experiment_log = {
    'experiment_name': 'lightgbm_tuning',
    'date': pd.Timestamp.now().isoformat(),
    'dataset': {
        'n_samples': len(X),
        'n_features': X.shape[1],
        'signal_rate': float(y.mean())
    },
    'default_model': {
        # Mirrors the arguments passed to create_lightgbm_pipeline for the baseline
        'params': {
            'n_estimators': 100,
            'learning_rate': 0.1,
            'max_depth': -1,
            'num_leaves': 31
        },
        'metrics': {
            'train_auc': float(metrics_default['train_auc']),
            'val_auc': float(metrics_default['val_auc']),
            'test_auc': float(metrics_default['test_auc'])
        }
    },
    'tuned_model': {
        # Strip the pipeline step prefix so the keys are plain LightGBM names
        'params': {k.replace('classifier__', ''): v for k, v in search.best_params_.items()},
        'metrics': {
            'train_auc': float(metrics_tuned['train_auc']),
            'val_auc': float(metrics_tuned['val_auc']),
            'test_auc': float(metrics_tuned['test_auc'])
        },
        'cv_score': float(search.best_score_)
    },
    'tuning': {
        'method': type(search).__name__,
        # Read the budget from the fitted search object so the log always
        # reflects what was actually run.
        'n_iterations': int(search.n_iter),
        'cv_folds': int(search.n_splits_),
        'search_time_seconds': float(search_time)
    }
}

# Save as JSON
log_path = reports_dir / '04_experiment_log.json'
with open(log_path, 'w', encoding='utf-8') as f:
    json.dump(experiment_log, f, indent=2, default=_json_default)

print("📋 Experiment Log:")
print("=" * 60)
print(json.dumps(experiment_log, indent=2, default=_json_default))
print(f"\n✅ Experiment log saved to: {log_path}")

## 10. Exercises

**Exercise 1**: Interpret overfitting

Given a model with:
- Train AUC = 0.995
- Val AUC = 0.920
- Test AUC = 0.915

Is the model overfitting? What would you do to fix it?

In [None]:
# Your answer here:
# 


**Exercise 2**: Design a hyperparameter search

You have a limited budget (30 minutes) to tune a GBM model.

Design a search strategy:
1. Which hyperparameters to prioritize?
2. What ranges to search?
3. How many iterations?
4. Grid search or random search?

In [None]:
# Your answer here:
# 


**Exercise 3**: Feature importance validation

You notice that `transaction_id` (unique identifier) has high feature importance in your fraud detection model.

What does this indicate? How would you investigate and fix it?

In [None]:
# Your answer here:
# 


---
## Solutions

**Solution 1**: Interpret overfitting

**Answer**: Yes, significant overfitting.

**Evidence**:
- Train-Val gap: 0.995 - 0.920 = 0.075 (7.5 percentage points)
- Model memorizing training data
- Val/Test performance similar → the gap comes from overfitting the training set, not from over-tuning on the validation set

**Root causes**:
1. Model too complex (too many trees, too deep)
2. Insufficient regularization
3. Not enough training data

**Fixes** (in order of priority):

1. **Early stopping**:
   ```python
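   lgb_model.fit(X_train, y_train,
                 eval_set=[(X_val, y_val)],
                 callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
   # Stops when val performance stops improving
   # (LightGBM >= 4.0 removed the `early_stopping_rounds` / `verbose` arguments of fit())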
   ```

2. **Regularization**:
   - Increase `min_child_samples` (e.g., 20 → 100)
   - Increase `reg_alpha`, `reg_lambda` (0.0 → 0.1)
   - Reduce `num_leaves` (127 → 31)

3. **Ensemble diversity**:
   - Lower `subsample` (1.0 → 0.8)
   - Lower `colsample_bytree` (1.0 → 0.8)
   
4. **Reduce capacity**:
   - Fewer trees (`n_estimators` 500 → 200)
   - Shallower trees (`max_depth` -1 → 5)

5. **More data**:
   - Collect more training samples
   - Data augmentation (if applicable)

**Expected outcome**: Train AUC ≈ 0.93, Val AUC ≈ 0.92 (gap < 0.02)

In [None]:
# Solution 1 (code demonstration): prints the overfitting diagnosis and the
# prioritized list of fixes; no model is trained here.
print("Overfitting Diagnosis & Fix")
print("=" * 60)
print("\nSymptoms:")
print("  Train AUC = 0.995 ← Too high (memorization)")
print("  Val AUC   = 0.920 ← Much lower")
print("  Gap       = 0.075 ← Significant overfitting")
print("\nFixes (prioritized):")
print("  1. Early stopping (monitor val loss)")
print("  2. Increase regularization (reg_alpha, reg_lambda)")
print("  3. Reduce model complexity (num_leaves, max_depth)")
print("  4. Ensemble diversity (subsample, colsample)")
print("  5. Collect more data")
print("\nExpected result:")
print("  Train AUC ≈ 0.93 (lower)")
print("  Val AUC   ≈ 0.92 (similar)")
print("  Gap       ≈ 0.01 (acceptable)")

**Solution 2**: Design hyperparameter search

**Strategy for 30-minute budget**:

```python
# Prioritized search strategy
param_grid = {
    # Tier 1: Most impactful (tune first)
    'classifier__n_estimators': [100, 200, 500],      # Capacity
    'classifier__learning_rate': [0.01, 0.05, 0.1],   # Step size
    
    # Tier 2: Regularization (tune second)
    'classifier__num_leaves': [31, 63],               # Complexity
    'classifier__min_child_samples': [20, 50],        # Overfitting
    
    # Tier 3: Fine-tuning (if time permits)
    'classifier__subsample': [0.8, 1.0],              # Stochasticity
    'classifier__colsample_bytree': [0.8, 1.0],       # Feature sampling
}

# Total combinations: 3 × 3 × 2 × 2 × 2 × 2 = 144

# Search method: Random Search (not Grid)
search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=50,              # 50 random samples (not all 144)
    cv=3,                   # 3-fold (not 5) for speed
    scoring='roc_auc',
    n_jobs=-1,              # Parallelize
    random_state=42
)
# Expected: 50 iterations × 3 folds × ~5s/fit = ~12 minutes

# If still too slow:
# - Reduce n_iter to 30
# - Use 2-fold CV (risky but fast)
# - Subsample training data (use 50% for search)
```

**Why this strategy**:
1. **Random > Grid**: Explores more of parameter space
2. **3-fold CV**: Balance between reliability and speed
3. **Prioritize impactful params**: `n_estimators`, `learning_rate` affect performance most
4. **Coarse grid**: Don't search `learning_rate=[0.01, 0.02, 0.03, ...]` (too fine)
5. **Parallel execution**: Use all CPU cores (`n_jobs=-1`)

**Alternative for very tight budget (10 min)**:
- Start with sensible defaults
- Tune only `n_estimators` and `learning_rate` (2D grid)
- Use early stopping instead of full search

In [None]:
# Solution 2 (code template): prints the proposed 30-minute search plan;
# nothing is executed beyond the summary itself.
print("30-Minute Hyperparameter Search Strategy")
print("=" * 60)
print("\nSearch Configuration:")
print("  Method: RandomizedSearchCV")
print("  Iterations: 50")
print("  CV Folds: 3")
print("  Parallelization: n_jobs=-1")
print("\nParameter Priorities:")
print("  Tier 1 (must tune):")
print("    • n_estimators [100, 200, 500]")
print("    • learning_rate [0.01, 0.05, 0.1]")
print("  Tier 2 (important):")
print("    • num_leaves [31, 63]")
print("    • min_child_samples [20, 50]")
print("  Tier 3 (nice to have):")
print("    • subsample [0.8, 1.0]")
print("    • colsample_bytree [0.8, 1.0]")
print("\nEstimated time:")
print("  50 iters × 3 folds × 5s = ~12 minutes")
print("  Buffer: 18 minutes (search can finish early)")

**Solution 3**: Feature importance validation

**What it indicates**: 🚨 **DATA LEAKAGE**

**Why**: `transaction_id` should have zero predictive power (it's a unique identifier).
High importance means:
1. IDs are assigned non-randomly (e.g., sequential, with fraud clusters)
2. IDs correlate with time → temporal leakage
3. IDs correlate with merchant/user → proxy variable leakage

**Investigation steps**:

```python
# 1. Check if ID is truly unique
assert df['transaction_id'].nunique() == len(df), "IDs not unique!"

# 2. Check correlation with target
fraud_rate_by_id = df.groupby('transaction_id')['is_fraud'].mean()
# If IDs are unique, every group should have 0 or 1 fraud
# If not unique, check distribution

# 3. Check if ID encodes temporal information
df['id_numeric'] = df['transaction_id'].str.extract(r'(\d+)').astype(int)
correlation = df[['id_numeric', 'is_fraud']].corr().iloc[0, 1]
print(f"ID-Target correlation: {correlation:.3f}")
# If high correlation → temporal leakage

# 4. Check if ID encodes merchant/user
# E.g., transaction_id = "merchant123_user456_timestamp"
# Merchant/user info leaks into ID

# 5. Plot fraud rate vs. ID
plt.scatter(df['id_numeric'], df['is_fraud'])
# If pattern visible → leakage
```

**Fixes**:

1. **Drop ID feature entirely** (best fix):
   ```python
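   # Match the column by name: a substring test such as `'id' in c` would also drop e.g. 'width' or 'paid_amount'
   feature_cols = [c for c in feature_cols if c != 'transaction_id']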
   ```

2. **If ID must be included** (rare):
   - One-hot encode (if categorical)
   - Hash to fixed dimension (if high cardinality)
   - Extract only non-leaky parts (e.g., merchant_id OK, timestamp NOT OK)

3. **Verify fix**:
   - Retrain model without ID
   - Check performance (should drop slightly if ID was useful)
   - Check feature importance (ID should be gone)

**Lesson**: High importance on identifiers/timestamps → investigate immediately for leakage.

In [None]:
# Solution 3 (code demonstration): prints the leakage checklist for a unique
# identifier that shows up with high feature importance.
print("Feature Importance Validation: transaction_id")
print("=" * 60)
print("\n🚨 RED FLAG: Unique identifier has high importance")
print("\nLikely causes:")
print("  1. IDs assigned non-randomly (sequential with fraud clusters)")
print("  2. IDs encode temporal information (timestamp leaked)")
print("  3. IDs encode merchant/user (proxy variables)")
print("\nInvestigation:")
print("  1. Check uniqueness: df['transaction_id'].nunique() == len(df)")
print("  2. Check correlation: df[['id_numeric', 'is_fraud']].corr()")
print("  3. Check pattern: plt.scatter(id, fraud_rate)")
print("  4. Parse ID structure: 'merchant_user_timestamp'")
print("\nFix:")
print("  • DROP transaction_id feature (99% of cases)")
print("  • If needed, extract only non-temporal components")
print("  • Verify: retrain, check importance again")
print("\nExpected outcome:")
print("  • Slight performance drop (1-2% AUC)")
print("  • Model now production-ready (no leakage)")

---

## ✅ Notebook Complete!

**What you learned**:
1. ✅ Train gradient boosting models (LightGBM)
2. ✅ Understand key hyperparameters (n_estimators, learning_rate, regularization)
3. ✅ Perform hyperparameter tuning with RandomizedSearchCV
4. ✅ Analyze learning curves (diagnose over/underfitting)
5. ✅ Interpret feature importance (explainability + leakage detection)

**Outputs saved**:
- `reports/04_learning_curve.png`
- `reports/04_feature_importance.png`
- `reports/04_hyperparameter_sensitivity.png`
- `reports/04_tuning_comparison.csv`
- `reports/04_experiment_log.json`

**Key takeaways**:
- Start with sensible defaults, then tune systematically
- RandomizedSearchCV > GridSearchCV for large search spaces
- Monitor train-val gap to detect overfitting
- Feature importance reveals both insights and leakage

**Next notebook**: `05_calibration_explainability_and_error_analysis.ipynb` — Calibrate probability estimates, add explainability (SHAP), and analyze failure modes.