In [1]:
# ============================================================================
# CREDIT CARD FRAUD DETECTION - MODEL BUILDING NOTEBOOK
# Phase 4: Building ML Models with Cross-Validation
# STEP 16: Modeling Notebook
# ============================================================================

# Import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, roc_auc_score, confusion_matrix, 
                             roc_curve, precision_recall_curve, auc, make_scorer)
from imblearn.over_sampling import SMOTE, ADASYN

# Set plot style
sns.set_style('whitegrid')
%matplotlib inline

print("="*70)
print("CREDIT CARD FRAUD DETECTION - MODEL BUILDING")
print("="*70)
print("- Build and evaluate machine learning models")
print("- Use proper cross-validation methodology")
print("- Compare SMOTE vs ADASYN resampling techniques")
print("- Compare model performance")
print("- Select best model for deployment")
print("="*70)

# ============================================================================
# STEP 1: Load Raw Data
# ============================================================================
print("\n" + "="*70)
print("STEP 1: LOADING DATA")
print("="*70)

print("\n Loading raw data from CSV...")

df = pd.read_csv('../data/raw/creditcard.csv')

print(f"✓ Loaded {len(df):,} transactions")
print(f"  Features: {len(df.columns) - 1}")
print(f"  Fraud cases: {df['Class'].sum():,} ({df['Class'].mean()*100:.3f}%)")
print(f"  Normal cases: {(df['Class'] == 0).sum():,} ({(df['Class'] == 0).mean()*100:.3f}%)")
print(f"  Imbalance ratio: 1:{(df['Class'] == 0).sum() / df['Class'].sum():.1f}")

# Quick data quality check
print(f"\n Quick Data Quality Check:")
print(f"  Missing values: {df.isnull().sum().sum()}")
print(f"  Duplicates: {df.duplicated().sum()}")

# Remove duplicates if found
if df.duplicated().sum() > 0:
    print(f"\n Removing duplicates...")
    original_count = len(df)
    df = df.drop_duplicates()
    removed_count = original_count - len(df)
    
    print(f"✓ Removed {removed_count:,} duplicate transactions")
    print(f"  Original: {original_count:,} transactions")
    print(f"  After cleaning: {len(df):,} transactions")
    print(f"  Fraud cases remaining: {df['Class'].sum():,}")
else:
    print(f"\n✓ No duplicates found - data is clean!")

print(f"\n Data loaded and cleaned successfully!")

# ============================================================================
# STEP 2: Train-Test Split (BEFORE any preprocessing)
# ============================================================================
print("\n" + "="*70)
print("STEP 2: TRAIN-TEST SPLIT")
print("="*70)

print("\n Splitting data into train (80%) and test (20%)...")
print("   Using stratified split to preserve fraud ratio in both sets")

# Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Stratified split - CRITICAL for imbalanced data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"\n✓ Split complete!")
print(f"\n Training Set:")
print(f"   Total samples: {len(X_train):,}")
print(f"   Fraud cases: {y_train.sum():,} ({y_train.mean()*100:.3f}%)")
print(f"   Normal cases: {(y_train == 0).sum():,} ({(y_train == 0).mean()*100:.3f}%)")

print(f"\n Test Set:")
print(f"   Total samples: {len(X_test):,}")
print(f"   Fraud cases: {y_test.sum():,} ({y_test.mean()*100:.3f}%)")
print(f"   Normal cases: {(y_test == 0).sum():,} ({(y_test == 0).mean()*100:.3f}%)")

print(f"\n Fraud ratio preserved: {y_train.mean():.6f} (train) vs {y_test.mean():.6f} (test)")

# ============================================================================
# STEP 3: Feature Scaling (Fit on Train, Transform Both)
# ============================================================================
print("\n" + "="*70)
print("STEP 3: FEATURE SCALING")
print("="*70)

print("\n Scaling Time and Amount features...")
print("   WHY: Time and Amount have different scales than V1-V28 (which are already PCA-transformed)")
print("   METHOD: StandardScaler (mean=0, std=1)")
print("   CRITICAL: Fit scaler on TRAINING data only to prevent data leakage!")

# Initialize scalers
scaler_time = StandardScaler()
scaler_amount = StandardScaler()

# Fit scalers on TRAINING data only
print("\n Fitting scalers on training data...")
scaler_time.fit(X_train[['Time']])
scaler_amount.fit(X_train[['Amount']])

print(f"✓ Time scaler fitted - mean: {scaler_time.mean_[0]:.2f}, std: {scaler_time.scale_[0]:.2f}")
print(f"✓ Amount scaler fitted - mean: {scaler_amount.mean_[0]:.2f}, std: {scaler_amount.scale_[0]:.2f}")

# Transform BOTH train and test using the SAME scalers
print("\n Transforming features...")

# Create copies to avoid modifying original data
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply transformations
X_train_scaled['Time_scaled'] = scaler_time.transform(X_train[['Time']])
X_train_scaled['Amount_scaled'] = scaler_amount.transform(X_train[['Amount']])

X_test_scaled['Time_scaled'] = scaler_time.transform(X_test[['Time']])
X_test_scaled['Amount_scaled'] = scaler_amount.transform(X_test[['Amount']])

# Drop original Time and Amount columns
X_train_scaled = X_train_scaled.drop(['Time', 'Amount'], axis=1)
X_test_scaled = X_test_scaled.drop(['Time', 'Amount'], axis=1)

# Reorder columns for consistency: V1-V28, then scaled features
feature_cols = [f'V{i}' for i in range(1, 29)] + ['Time_scaled', 'Amount_scaled']
X_train_scaled = X_train_scaled[feature_cols]
X_test_scaled = X_test_scaled[feature_cols]

print(f"✓ Features scaled successfully!")
print(f"  Final feature count: {X_train_scaled.shape[1]}")
print(f"  Feature order: V1-V28, Time_scaled, Amount_scaled")

# Verify scaling worked
print(f"\n Scaling Verification (Training Set):")
print(f"  Time_scaled - mean: {X_train_scaled['Time_scaled'].mean():.6f}, std: {X_train_scaled['Time_scaled'].std():.6f}")
print(f"  Amount_scaled - mean: {X_train_scaled['Amount_scaled'].mean():.6f}, std: {X_train_scaled['Amount_scaled'].std():.6f}")

# ============================================================================
# STEP 4: Apply BOTH SMOTE and ADASYN for Comparison
# ============================================================================
print("\n" + "="*70)
print("STEP 4: HANDLING CLASS IMBALANCE - SMOTE vs ADASYN")
print("="*70)

print("\n Creating balanced datasets with both methods...")
print("   WHY: Models struggle to learn from severely imbalanced data (599:1 ratio)")
print("   METHODS: SMOTE (uniform sampling) vs ADASYN (adaptive sampling)")
print("   CRITICAL: Apply to TRAINING data only - test must reflect real-world distribution!")

print(f"\n Original Training Set (Before Resampling):")
print(f"   Non-fraud: {(y_train == 0).sum():,}")
print(f"   Fraud: {(y_train == 1).sum():,}")
print(f"   Ratio: 1:{(y_train == 0).sum() / (y_train == 1).sum():.1f}")

# ============================================================================
# Apply SMOTE
# ============================================================================
print("\n" + "-"*70)
print("SMOTE (Synthetic Minority Over-sampling Technique)")
print("-"*70)
print("Creates synthetic samples uniformly between minority class neighbors")

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(f"\n After SMOTE:")
print(f"   Non-fraud: {(y_train_smote == 0).sum():,}")
print(f"   Fraud: {(y_train_smote == 1).sum():,}")
print(f"   Ratio: 1:{(y_train_smote == 0).sum() / (y_train_smote == 1).sum():.1f}")
print(f"   Synthetic frauds created: {(y_train_smote == 1).sum() - y_train.sum():,}")

# ============================================================================
# Apply ADASYN
# ============================================================================
print("\n" + "-"*70)
print("ADASYN (Adaptive Synthetic Sampling)")
print("-"*70)
print("Creates MORE samples in harder-to-learn regions (adaptive approach)")

adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_scaled, y_train)

print(f"\n After ADASYN:")
print(f"   Non-fraud: {(y_train_adasyn == 0).sum():,}")
print(f"   Fraud: {(y_train_adasyn == 1).sum():,}")
print(f"   Ratio: 1:{(y_train_adasyn == 0).sum() / (y_train_adasyn == 1).sum():.1f}")
print(f"   Synthetic frauds created: {(y_train_adasyn == 1).sum() - y_train.sum():,}")

# ============================================================================
# Compare SMOTE vs ADASYN
# ============================================================================
print("\n" + "-"*70)
print("SMOTE vs ADASYN Comparison")
print("-"*70)

smote_samples = (y_train_smote == 1).sum()
adasyn_samples = (y_train_adasyn == 1).sum()
difference = abs(smote_samples - adasyn_samples)

print(f"\nTotal fraud samples (including synthetic):")
print(f"   SMOTE:  {smote_samples:,}")
print(f"   ADASYN: {adasyn_samples:,}")
print(f"   Difference: {difference:,} samples")

if adasyn_samples > smote_samples:
    print(f"\n ADASYN created {difference:,} MORE samples than SMOTE")
    print("   This suggests ADASYN identified more difficult boundary regions")
elif smote_samples > adasyn_samples:
    print(f"\n SMOTE created {difference:,} MORE samples than ADASYN")
    print("   This suggests uniform sampling covered more ground")
else:
    print("\n Both methods created the same number of samples")

# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("\n" + "="*70)
print("DATA PREPARATION COMPLETE - READY FOR MODEL BUILDING!")
print("="*70)

print(f"\n Final Dataset Summary:")

print(f"\n   Original Training Set:")
print(f"      Total samples: {X_train_scaled.shape[0]:,}")
print(f"      Fraud samples: {y_train.sum():,}")
print(f"      Features: {X_train_scaled.shape[1]}")
print(f"      Class distribution: {y_train.mean()*100:.3f}% fraud (imbalanced)")

print(f"\n   SMOTE-Balanced Training Set:")
print(f"      Total samples: {X_train_smote.shape[0]:,}")
print(f"      Fraud samples: {y_train_smote.sum():,}")
print(f"      Features: {X_train_smote.shape[1]}")
print(f"      Class distribution: 50/50 (balanced)")

print(f"\n   ADASYN-Balanced Training Set:")
print(f"      Total samples: {X_train_adasyn.shape[0]:,}")
print(f"      Fraud samples: {y_train_adasyn.sum():,}")
print(f"      Features: {X_train_adasyn.shape[1]}")
print(f"      Class distribution: ~50/50 (balanced)")

print(f"\n   Test Set (Original Distribution):")
print(f"      Total samples: {X_test_scaled.shape[0]:,}")
print(f"      Fraud samples: {y_test.sum():,}")
print(f"      Features: {X_test_scaled.shape[1]}")
print(f"      Class distribution: {y_test.mean()*100:.3f}% fraud (real-world)")

print("\n KEY ACCOMPLISHMENTS:")
print("   ✓ Proper train-test split with stratification")
print("   ✓ No data leakage (scalers fitted on train only)")
print("   ✓ Two balanced training datasets created (SMOTE and ADASYN)")
print("   ✓ Test set preserves real-world distribution for valid evaluation")
print("   ✓ Ready to compare both resampling techniques in cross-validation!")

print("\n NEXT STEPS:")
print("   1. Build models with SMOTE-balanced data")
print("   2. Build models with ADASYN-balanced data")
print("   3. Compare which resampling technique performs better")
print("   4. Select best combination of model + resampling technique")

print("\n" + "="*70)

CREDIT CARD FRAUD DETECTION - MODEL BUILDING
- Build and evaluate machine learning models
- Use proper cross-validation methodology
- Compare SMOTE vs ADASYN resampling techniques
- Compare model performance
- Select best model for deployment

STEP 1: LOADING DATA

 Loading raw data from CSV...
✓ Loaded 284,807 transactions
  Features: 30
  Fraud cases: 492 (0.173%)
  Normal cases: 284,315 (99.827%)
  Imbalance ratio: 1:577.9

 Quick Data Quality Check:
  Missing values: 0
  Duplicates: 1081

 Removing duplicates...
✓ Removed 1,081 duplicate transactions
  Original: 284,807 transactions
  After cleaning: 283,726 transactions
  Fraud cases remaining: 473

 Data loaded and cleaned successfully!

STEP 2: TRAIN-TEST SPLIT

 Splitting data into train (80%) and test (20%)...
   Using stratified split to preserve fraud ratio in both sets

✓ Split complete!

 Training Set:
   Total samples: 226,980
   Fraud cases: 378 (0.167%)
   Normal cases: 226,602 (99.833%)

 Test Set:
   Total samples: 56

In [4]:
# ============================================================================
# STEP 17: BUILD LOGISTIC REGRESSION MODEL (WITH SMOTE AND ADASYN)
# ============================================================================

# Import additional required libraries
import pickle
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, ADASYN
import os

print("\n" + "="*70)
print("MODEL 1: LOGISTIC REGRESSION")
print("="*70)

print("\n WHAT WE'RE DOING:")
print("   - Building baseline Logistic Regression model")
print("   - Testing TWO resampling techniques: SMOTE vs ADASYN")
print("   - Using 5-fold cross-validation for performance estimation")
print("   - Applying resampling INSIDE each CV fold (proper methodology)")
print("   - Validating on REAL data only (no synthetic frauds in validation)")
print("   - SAVING model and results to disk for later use")

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# ============================================================================
# PART A: LOGISTIC REGRESSION WITH SMOTE
# ============================================================================
print("\n" + "="*70)
print("PART A: LOGISTIC REGRESSION + SMOTE")
print("="*70)

print("\n About SMOTE:")
print("   - Synthetic Minority Over-sampling Technique")
print("   - Creates synthetic samples by interpolating between existing frauds")
print("   - Places new samples along line segments between neighbors")
print("   - Good for general class imbalance")

# ----------------------------------------------------------------------------
# Create SMOTE Pipeline
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Creating SMOTE Pipeline")
print("-"*70)

print("\n  Building pipeline with 2 steps:")
print("   Step 1: SMOTE (balance training data)")
print("   Step 2: Logistic Regression (classification)")

# Create SMOTE pipeline
lr_smote_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1))
])

print("\n+ SMOTE pipeline created successfully!")

# ----------------------------------------------------------------------------
# Cross-Validation with SMOTE
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running 5-Fold Cross-Validation with SMOTE")
print("-"*70)

# Calculate dataset info dynamically (NO HARDCODING!)
total_samples = len(X_train_scaled)
total_frauds = int(y_train.sum())
samples_per_fold = total_samples // 5
frauds_per_fold = total_frauds // 5

print(f"\n Running 5-fold cross-validation...")
print(f"   Dataset: X_train_scaled ({total_samples:,} samples, {total_frauds} frauds)")
print(f"   Each fold: ~{samples_per_fold:,} samples (~{frauds_per_fold} real frauds)")

start_time = time.time()

# Perform 5-fold CV with SMOTE
cv_scores_smote = cross_val_score(
    lr_smote_pipeline,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

cv_time_smote = time.time() - start_time

# Display SMOTE results
print("\n" + "-"*70)
print("SMOTE Cross-Validation Results")
print("-"*70)

print(f"\n+ Cross-validation complete in {cv_time_smote:.2f} seconds")

print(f"\n Individual Fold Scores (SMOTE):")
for i, score in enumerate(cv_scores_smote, 1):
    print(f"   Fold {i}: AUC = {score:.4f}")

print(f"\n SMOTE Summary Statistics:")
print(f"   Mean CV AUC:  {cv_scores_smote.mean():.4f}")
print(f"   Std CV AUC:   {cv_scores_smote.std():.4f}")
print(f"   Min CV AUC:   {cv_scores_smote.min():.4f}")
print(f"   Max CV AUC:   {cv_scores_smote.max():.4f}")

# Calculate 95% confidence interval
confidence_interval_smote = 1.96 * cv_scores_smote.std()
print(f"\n 95% Confidence Interval:")
print(f"   {cv_scores_smote.mean():.4f} +/- {confidence_interval_smote:.4f}")
print(f"   Range: [{cv_scores_smote.mean() - confidence_interval_smote:.4f}, "
      f"{cv_scores_smote.mean() + confidence_interval_smote:.4f}]")

# ----------------------------------------------------------------------------
# Train Final SMOTE Model
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Training Final SMOTE Model")
print("-"*70)

print("\n Training final model with SMOTE on all training data...")

start_time = time.time()
lr_smote_pipeline.fit(X_train_scaled, y_train)
train_time_smote = time.time() - start_time

print(f"+ SMOTE model training complete in {train_time_smote:.2f} seconds")

# Store SMOTE results
lr_smote_cv_mean = cv_scores_smote.mean()
lr_smote_cv_std = cv_scores_smote.std()

print(f"\n SMOTE Model Summary:")
print(f"   Expected AUC: {lr_smote_cv_mean:.4f} +/- {lr_smote_cv_std:.4f}")
print(f"   Stability: {'Excellent' if lr_smote_cv_std < 0.01 else 'Good' if lr_smote_cv_std < 0.02 else 'Moderate'}")


# ============================================================================
# PART B: LOGISTIC REGRESSION WITH ADASYN
# ============================================================================
print("\n" + "="*70)
print("PART B: LOGISTIC REGRESSION + ADASYN")
print("="*70)

print("\n About ADASYN:")
print("   - Adaptive Synthetic Sampling")
print("   - Focuses on harder-to-learn minority samples")
print("   - Creates MORE synthetics near decision boundary (harder cases)")
print("   - Creates FEWER synthetics in easy regions")
print("   - Better for complex, varied fraud patterns")

# ----------------------------------------------------------------------------
# Create ADASYN Pipeline
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Creating ADASYN Pipeline")
print("-"*70)

print("\n  Building pipeline with 2 steps:")
print("   Step 1: ADASYN (adaptive balance training data)")
print("   Step 2: Logistic Regression (classification)")

# Create ADASYN pipeline
lr_adasyn_pipeline = ImbPipeline([
    ('adasyn', ADASYN(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1))
])

print("\n+ ADASYN pipeline created successfully!")

# ----------------------------------------------------------------------------
# Cross-Validation with ADASYN
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running 5-Fold Cross-Validation with ADASYN")
print("-"*70)

print(f"\n Running 5-fold cross-validation...")
print(f"   Dataset: X_train_scaled ({total_samples:,} samples, {total_frauds} frauds)")
print(f"   ADASYN will adaptively generate synthetics based on difficulty")

start_time = time.time()

# Perform 5-fold CV with ADASYN
cv_scores_adasyn = cross_val_score(
    lr_adasyn_pipeline,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

cv_time_adasyn = time.time() - start_time

# Display ADASYN results
print("\n" + "-"*70)
print("ADASYN Cross-Validation Results")
print("-"*70)

print(f"\n+ Cross-validation complete in {cv_time_adasyn:.2f} seconds")

print(f"\n Individual Fold Scores (ADASYN):")
for i, score in enumerate(cv_scores_adasyn, 1):
    print(f"   Fold {i}: AUC = {score:.4f}")

print(f"\n ADASYN Summary Statistics:")
print(f"   Mean CV AUC:  {cv_scores_adasyn.mean():.4f}")
print(f"   Std CV AUC:   {cv_scores_adasyn.std():.4f}")
print(f"   Min CV AUC:   {cv_scores_adasyn.min():.4f}")
print(f"   Max CV AUC:   {cv_scores_adasyn.max():.4f}")

# Calculate 95% confidence interval
confidence_interval_adasyn = 1.96 * cv_scores_adasyn.std()
print(f"\n 95% Confidence Interval:")
print(f"   {cv_scores_adasyn.mean():.4f} +/- {confidence_interval_adasyn:.4f}")
print(f"   Range: [{cv_scores_adasyn.mean() - confidence_interval_adasyn:.4f}, "
      f"{cv_scores_adasyn.mean() + confidence_interval_adasyn:.4f}]")

# ----------------------------------------------------------------------------
# Train Final ADASYN Model
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Training Final ADASYN Model")
print("-"*70)

print("\n Training final model with ADASYN on all training data...")

start_time = time.time()
lr_adasyn_pipeline.fit(X_train_scaled, y_train)
train_time_adasyn = time.time() - start_time

print(f"+ ADASYN model training complete in {train_time_adasyn:.2f} seconds")

# Store ADASYN results
lr_adasyn_cv_mean = cv_scores_adasyn.mean()
lr_adasyn_cv_std = cv_scores_adasyn.std()

print(f"\n ADASYN Model Summary:")
print(f"   Expected AUC: {lr_adasyn_cv_mean:.4f} +/- {lr_adasyn_cv_std:.4f}")
print(f"   Stability: {'Excellent' if lr_adasyn_cv_std < 0.01 else 'Good' if lr_adasyn_cv_std < 0.02 else 'Moderate'}")


# ============================================================================
# COMPARISON: SMOTE vs ADASYN
# ============================================================================
print("\n" + "="*70)
print("SMOTE vs ADASYN COMPARISON")
print("="*70)

print(f"\n Performance Comparison:")
print(f"   SMOTE  Mean AUC: {lr_smote_cv_mean:.4f} (+/-{lr_smote_cv_std:.4f})")
print(f"   ADASYN Mean AUC: {lr_adasyn_cv_mean:.4f} (+/-{lr_adasyn_cv_std:.4f})")

# Calculate difference
auc_difference = lr_adasyn_cv_mean - lr_smote_cv_mean
print(f"\n   Difference: {abs(auc_difference):.4f} ({'ADASYN better' if auc_difference > 0 else 'SMOTE better'})")

# Determine winner
if abs(auc_difference) < 0.001:
    winner = "TIE - Performance essentially identical"
    recommendation = "Either technique is suitable - choose based on training time"
elif auc_difference > 0:
    winner = "ADASYN WINS"
    recommendation = "ADASYN provides better fraud detection for this dataset"
else:
    winner = "SMOTE WINS"
    recommendation = "SMOTE provides better fraud detection for this dataset"

print(f"\n Winner: {winner}")
print(f"\n Recommendation: {recommendation}")

print(f"\n  Training Time Comparison:")
print(f"   SMOTE:  {cv_time_smote:.2f} seconds")
print(f"   ADASYN: {cv_time_adasyn:.2f} seconds")
print(f"   Difference: {abs(cv_time_smote - cv_time_adasyn):.2f} seconds ({'ADASYN slower' if cv_time_adasyn > cv_time_smote else 'SMOTE slower'})")

print(f"\n Stability Comparison:")
print(f"   SMOTE  Std: {lr_smote_cv_std:.4f} ({'More stable' if lr_smote_cv_std < lr_adasyn_cv_std else 'Less stable'})")
print(f"   ADASYN Std: {lr_adasyn_cv_std:.4f} ({'More stable' if lr_adasyn_cv_std < lr_smote_cv_std else 'Less stable'})")

# Statistical significance test (simple check)
print(f"\n Statistical Significance:")
smote_lower = lr_smote_cv_mean - confidence_interval_smote
smote_upper = lr_smote_cv_mean + confidence_interval_smote
adasyn_lower = lr_adasyn_cv_mean - confidence_interval_adasyn
adasyn_upper = lr_adasyn_cv_mean + confidence_interval_adasyn

if (smote_lower <= adasyn_upper) and (adasyn_lower <= smote_upper):
    print(f"   Confidence intervals overlap")
    print(f"   Difference may not be statistically significant")
    print(f"   Both methods perform similarly on this dataset")
else:
    print(f"   + Confidence intervals don't overlap")
    print(f"   Difference is likely statistically significant")
    print(f"   {winner} is meaningfully better")

# Visualization of comparison
print("\n" + "-"*70)
print("Visual Comparison")
print("-"*70)

print("\n Mean AUC Scores:")
print(f"   SMOTE:  {'█' * int(lr_smote_cv_mean * 50)} {lr_smote_cv_mean:.4f}")
print(f"   ADASYN: {'█' * int(lr_adasyn_cv_mean * 50)} {lr_adasyn_cv_mean:.4f}")

print(f"\n Consistency (Lower is Better):")
print(f"   SMOTE:  {'█' * int(lr_smote_cv_std * 1000)} {lr_smote_cv_std:.4f}")
print(f"   ADASYN: {'█' * int(lr_adasyn_cv_std * 1000)} {lr_adasyn_cv_std:.4f}")


# ============================================================================
# SELECT BEST MODEL FOR FINAL USE
# ============================================================================
print("\n" + "="*70)
print("SELECTING BEST RESAMPLING TECHNIQUE")
print("="*70)

# Choose best based on mean AUC (with tie-breaker on stability)
if abs(auc_difference) < 0.001:
    # Performance is essentially the same, choose more stable
    if lr_smote_cv_std < lr_adasyn_cv_std:
        best_pipeline = lr_smote_pipeline
        best_method = "SMOTE"
        best_cv_mean = lr_smote_cv_mean
        best_cv_std = lr_smote_cv_std
        best_cv_scores = cv_scores_smote
        reason = "Similar performance, but SMOTE is more stable"
    else:
        best_pipeline = lr_adasyn_pipeline
        best_method = "ADASYN"
        best_cv_mean = lr_adasyn_cv_mean
        best_cv_std = lr_adasyn_cv_std
        best_cv_scores = cv_scores_adasyn
        reason = "Similar performance, but ADASYN is more stable"
else:
    # Clear winner based on AUC
    if lr_smote_cv_mean > lr_adasyn_cv_mean:
        best_pipeline = lr_smote_pipeline
        best_method = "SMOTE"
        best_cv_mean = lr_smote_cv_mean
        best_cv_std = lr_smote_cv_std
        best_cv_scores = cv_scores_smote
        reason = f"Higher mean AUC ({lr_smote_cv_mean:.4f} vs {lr_adasyn_cv_mean:.4f})"
    else:
        best_pipeline = lr_adasyn_pipeline
        best_method = "ADASYN"
        best_cv_mean = lr_adasyn_cv_mean
        best_cv_std = lr_adasyn_cv_std
        best_cv_scores = cv_scores_adasyn
        reason = f"Higher mean AUC ({lr_adasyn_cv_mean:.4f} vs {lr_smote_cv_mean:.4f})"

print(f"\n Selected Method: {best_method}")
print(f"   Reason: {reason}")
print(f"   Performance: {best_cv_mean:.4f} +/- {best_cv_std:.4f}")

print(f"\n Storing best model for later steps:")
print(f"   Model: Logistic Regression + {best_method}")
print(f"   Expected AUC: {best_cv_mean:.4f}")

# Store the best results for comparison in Step 21
lr_final_pipeline = best_pipeline
lr_cv_mean = best_cv_mean
lr_cv_std = best_cv_std
lr_cv_scores = best_cv_scores
lr_best_method = best_method


# ============================================================================
# HYPERPARAMETER TUNING FOR BEST MODEL
# ============================================================================
print("\n" + "="*70)
print("HYPERPARAMETER TUNING - LOGISTIC REGRESSION")
print("="*70)

print(f"\n WHAT WE'RE DOING:")
print(f"   - Fine-tuning the best model ({best_method} + Logistic Regression)")
print(f"   - Testing different hyperparameter combinations")
print(f"   - Using GridSearchCV with cross-validation")
print(f"   - Goal: Improve beyond baseline {best_cv_mean:.4f} AUC")

# ----------------------------------------------------------------------------
# Define Hyperparameter Grid
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Defining Hyperparameter Search Space")
print("-"*70)

print("\n Hyperparameters to tune:")
print("   1. C (Regularization strength)")
print("      - Controls model complexity")
print("      - Smaller C = stronger regularization (simpler model)")
print("      - Larger C = weaker regularization (more complex model)")
print("\n   2. penalty (Regularization type)")
print("      - 'l2': Ridge regularization (default)")
print("      - 'l1': Lasso regularization (feature selection)")
print("\n   3. solver (Optimization algorithm)")
print("      - Different algorithms for finding best coefficients")
print("      - Some solvers work better with certain penalties")

# Define parameter grid
param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']
}

print("\n Parameter Grid:")
print(f"   C values: {param_grid['classifier__C']}")
print(f"   Penalties: {param_grid['classifier__penalty']}")
print(f"   Solvers: {param_grid['classifier__solver']}")
print(f"\n   Total combinations: {len(param_grid['classifier__C']) * len(param_grid['classifier__penalty']) * len(param_grid['classifier__solver'])}")
print(f"   With 5-fold CV: {len(param_grid['classifier__C']) * len(param_grid['classifier__penalty']) * len(param_grid['classifier__solver']) * 5} model fits")

# ----------------------------------------------------------------------------
# Run Grid Search
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running Grid Search with Cross-Validation")
print("-"*70)

print("\n This will take a few minutes...")
print("   Each combination is tested with 5-fold CV")
print("   Progress will be shown below")

# Create GridSearchCV
grid_search = GridSearchCV(
    estimator=lr_final_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

# Run grid search
start_time = time.time()
grid_search.fit(X_train_scaled, y_train)
grid_time = time.time() - start_time

# ----------------------------------------------------------------------------
# Display Results
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Grid Search Results")
print("-"*70)

print(f"\n+ Grid search complete in {grid_time:.2f} seconds ({grid_time/60:.1f} minutes)")

print(f"\n Best Parameters Found:")
for param_name, param_value in grid_search.best_params_.items():
    # Remove 'classifier__' prefix for cleaner display
    clean_name = param_name.replace('classifier__', '')
    print(f"   {clean_name}: {param_value}")

print(f"\n Performance Comparison:")
print(f"   Baseline AUC:  {best_cv_mean:.4f} (+/-{best_cv_std:.4f})")
print(f"   Tuned AUC:     {grid_search.best_score_:.4f} (+/-{grid_search.cv_results_['std_test_score'][grid_search.best_index_]:.4f})")

# Calculate improvement
improvement = grid_search.best_score_ - best_cv_mean
improvement_pct = (improvement / best_cv_mean) * 100

print(f"\n Improvement:")
if improvement > 0:
    print(f"   +{improvement:.4f} AUC ({improvement_pct:+.2f}%)")
    print(f"   + Hyperparameter tuning improved performance!")
elif improvement < -0.001:
    print(f"   {improvement:.4f} AUC ({improvement_pct:.2f}%)")
    print(f"   Tuned model performed slightly worse")
    print(f"   -> Baseline model was already well-optimized")
else:
    print(f"   ~{improvement:.4f} AUC (essentially no change)")
    print(f"   -> Baseline hyperparameters were already optimal")

# ----------------------------------------------------------------------------
# Top 5 Parameter Combinations
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Top 5 Hyperparameter Combinations")
print("-"*70)

# Get results sorted by score
results_df = pd.DataFrame(grid_search.cv_results_)
results_df = results_df.sort_values('rank_test_score')

print("\n Best performing combinations:\n")
for i in range(min(5, len(results_df))):
    row = results_df.iloc[i]
    print(f"   Rank {i+1}:")
    print(f"      C={row['param_classifier__C']}, "
          f"penalty={row['param_classifier__penalty']}, "
          f"solver={row['param_classifier__solver']}")
    print(f"      Mean AUC: {row['mean_test_score']:.4f} (+/-{row['std_test_score']:.4f})")
    print()

# ----------------------------------------------------------------------------
# Store Tuned Model
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Storing Tuned Model")
print("-"*70)

# Store the tuned model
lr_tuned_pipeline = grid_search.best_estimator_
lr_tuned_cv_mean = grid_search.best_score_
lr_tuned_cv_std = grid_search.cv_results_['std_test_score'][grid_search.best_index_]
lr_best_params = grid_search.best_params_

print(f"\n+ Tuned model stored for final evaluation")
print(f"   Best hyperparameters saved")
print(f"   Expected AUC: {lr_tuned_cv_mean:.4f}")

# ----------------------------------------------------------------------------
# Decision: Use Baseline or Tuned?
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Model Selection Decision")
print("-"*70)

# Decide whether to use tuned or baseline model
if improvement > 0.0001:  # Meaningful improvement
    print(f"\n+ Using TUNED model for final evaluation")
    print(f"   Reason: Tuning improved performance by {improvement:.4f}")
    lr_final_model = lr_tuned_pipeline
    lr_final_cv_mean = lr_tuned_cv_mean
    lr_final_cv_std = lr_tuned_cv_std
    model_version = "Tuned"
else:
    print(f"\n+ Using BASELINE model for final evaluation")
    print(f"   Reason: Tuning did not provide meaningful improvement")
    print(f"   Baseline model was already well-optimized")
    lr_final_model = lr_final_pipeline
    lr_final_cv_mean = best_cv_mean
    lr_final_cv_std = best_cv_std
    model_version = "Baseline"

print(f"\n Final Logistic Regression Model:")
print(f"   Version: {model_version}")
print(f"   Resampling: {best_method}")
print(f"   Expected AUC: {lr_final_cv_mean:.4f} (+/-{lr_final_cv_std:.4f})")


# ============================================================================
# SAVE MODEL AND RESULTS TO DISK
# ============================================================================
print("\n" + "="*70)
print("SAVING MODEL AND RESULTS")
print("="*70)

print("\n Saving to ../models/ directory...")

# Save the final model
model_path = '../models/logistic_regression_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(lr_final_model, f)
print(f"   + Model saved: {model_path}")

# Save all results and metadata
results_data = {
    'model_name': 'Logistic Regression',
    'resampling_method': best_method,
    'model_version': model_version,
    'cv_mean': lr_final_cv_mean,
    'cv_std': lr_final_cv_std,
    'cv_scores': lr_cv_scores,
    'best_params': lr_best_params if model_version == "Tuned" else None,
    'training_samples': total_samples,
    'training_frauds': total_frauds,
    'timestamp': pd.Timestamp.now()
}

results_path = '../models/logistic_regression_results.pkl'
with open(results_path, 'wb') as f:
    pickle.dump(results_data, f)
print(f"   + Results saved: {results_path}")

print(f"\n+ All files saved successfully!")
print(f"   Model can now be loaded in Phase 5 for comparison")


# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("\n" + "="*70)
print("STEP 17 COMPLETE - KEY ACCOMPLISHMENTS")
print("="*70)

print("\n What We Accomplished:")
print("   + Built baseline Logistic Regression with TWO resampling techniques")
print("   + Compared SMOTE vs ADASYN performance")
print("   + Used proper CV methodology (resampling inside folds)")
print("   + Validated on 100% real data (no synthetic contamination)")
print("   + Selected best resampling technique based on performance")
print("   + Performed hyperparameter tuning with GridSearchCV")
print("   + Trained final optimized model on all available training data")
print("   + SAVED model and results to disk")

print(f"\n Final Results:")
print(f"   Best Method: {best_method}")
print(f"   Model Version: {model_version}")
print(f"   Cross-Validation AUC: {lr_final_cv_mean:.4f} (+/-{lr_final_cv_std:.4f})")
print(f"   Model Stability: {'Excellent' if lr_final_cv_std < 0.01 else 'Good' if lr_final_cv_std < 0.02 else 'Moderate'}")

print("\n Saved Files:")
print(f"   - {model_path}")
print(f"   - {results_path}")

print("\n Next Steps:")
print("   -> Step 18: Build Random Forest Model")
print("   -> Step 19: Build XGBoost Model")
print("   -> Step 20: Build Neural Network (Deep Learning)")
print("   -> Step 21: Compare all models and select winner")

print("\n Important Notes:")
print("   - Test set remains UNTOUCHED for final evaluation in Phase 5")
print("   - Model saved and can be loaded anytime (no need to retrain)")
print("   - Can restart PC without losing progress")
print(f"   - {best_method} + {model_version} model ready for Phase 5")

print("\n" + "="*70)


MODEL 1: LOGISTIC REGRESSION

 WHAT WE'RE DOING:
   - Building baseline Logistic Regression model
   - Testing TWO resampling techniques: SMOTE vs ADASYN
   - Using 5-fold cross-validation for performance estimation
   - Applying resampling INSIDE each CV fold (proper methodology)
   - Validating on REAL data only (no synthetic frauds in validation)
   - SAVING model and results to disk for later use

PART A: LOGISTIC REGRESSION + SMOTE

 About SMOTE:
   - Synthetic Minority Over-sampling Technique
   - Creates synthetic samples by interpolating between existing frauds
   - Places new samples along line segments between neighbors
   - Good for general class imbalance

----------------------------------------------------------------------
Creating SMOTE Pipeline
----------------------------------------------------------------------

  Building pipeline with 2 steps:
   Step 1: SMOTE (balance training data)
   Step 2: Logistic Regression (classification)

+ SMOTE pipeline created succes

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.5s finished



----------------------------------------------------------------------
SMOTE Cross-Validation Results
----------------------------------------------------------------------

+ Cross-validation complete in 3.76 seconds

 Individual Fold Scores (SMOTE):
   Fold 1: AUC = 0.9699
   Fold 2: AUC = 0.9826
   Fold 3: AUC = 0.9987
   Fold 4: AUC = 0.9829
   Fold 5: AUC = 0.9780

 SMOTE Summary Statistics:
   Mean CV AUC:  0.9824
   Std CV AUC:   0.0094
   Min CV AUC:   0.9699
   Max CV AUC:   0.9987

 95% Confidence Interval:
   0.9824 +/- 0.0184
   Range: [0.9640, 1.0008]

----------------------------------------------------------------------
Training Final SMOTE Model
----------------------------------------------------------------------

 Training final model with SMOTE on all training data...
+ SMOTE model training complete in 2.23 seconds

 SMOTE Model Summary:
   Expected AUC: 0.9824 +/- 0.0094
   Stability: Excellent

PART B: LOGISTIC REGRESSION + ADASYN

 About ADASYN:
   - Adaptive Sy

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.5s finished



----------------------------------------------------------------------
ADASYN Cross-Validation Results
----------------------------------------------------------------------

+ Cross-validation complete in 3.77 seconds

 Individual Fold Scores (ADASYN):
   Fold 1: AUC = 0.9696
   Fold 2: AUC = 0.9788
   Fold 3: AUC = 0.9987
   Fold 4: AUC = 0.9803
   Fold 5: AUC = 0.9798

 ADASYN Summary Statistics:
   Mean CV AUC:  0.9815
   Std CV AUC:   0.0095
   Min CV AUC:   0.9696
   Max CV AUC:   0.9987

 95% Confidence Interval:
   0.9815 +/- 0.0186
   Range: [0.9629, 1.0000]

----------------------------------------------------------------------
Training Final ADASYN Model
----------------------------------------------------------------------

 Training final model with ADASYN on all training data...
+ ADASYN model training complete in 2.23 seconds

 ADASYN Model Summary:
   Expected AUC: 0.9815 +/- 0.0095
   Stability: Excellent

SMOTE vs ADASYN COMPARISON

 Performance Comparison:
   SMOTE 

In [5]:
# ============================================================================
# STEP 18: BUILD RANDOM FOREST MODEL (WITH SMOTE AND ADASYN)
# ============================================================================

# Import additional required libraries
import pickle
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, ADASYN
import os

print("\n" + "="*70)
print("MODEL 2: RANDOM FOREST")
print("="*70)

print("\n WHAT WE'RE DOING:")
print("   - Building Random Forest classifier (ensemble of 100 decision trees)")
print("   - Testing TWO resampling techniques: SMOTE vs ADASYN")
print("   - Using 5-fold cross-validation for performance estimation")
print("   - Applying resampling INSIDE each CV fold (proper methodology)")
print("   - Validating on REAL data only (no synthetic frauds in validation)")
print("   - SAVING model and results to disk for later use")

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# ============================================================================
# PART A: RANDOM FOREST WITH SMOTE
# ============================================================================
print("\n" + "="*70)
print("PART A: RANDOM FOREST + SMOTE")
print("="*70)

print("\n About Random Forest:")
print("   - Ensemble of 100 decision trees voting together")
print("   - Each tree trained on random subset of data")
print("   - Each tree focuses on different feature combinations")
print("   - Final prediction = majority vote from all trees")
print("   - More powerful than Logistic Regression for complex patterns")

print("\n About SMOTE:")
print("   - Synthetic Minority Over-sampling Technique")
print("   - Creates synthetic samples by interpolating between existing frauds")
print("   - Places new samples along line segments between neighbors")
print("   - Good for general class imbalance")

# ----------------------------------------------------------------------------
# Create SMOTE Pipeline
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Creating SMOTE Pipeline")
print("-"*70)

print("\n Building pipeline with 2 steps:")
print("   Step 1: SMOTE (balance training data)")
print("   Step 2: Random Forest (classification with 100 trees)")

# Create SMOTE pipeline
rf_smote_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1
    ))
])

print("\n+ SMOTE pipeline created successfully!")

# ----------------------------------------------------------------------------
# Cross-Validation with SMOTE
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running 5-Fold Cross-Validation with SMOTE")
print("-"*70)

# Calculate dataset info dynamically
total_samples = len(X_train_scaled)
total_frauds = int(y_train.sum())
samples_per_fold = total_samples // 5
frauds_per_fold = total_frauds // 5

print(f"\n Running 5-fold cross-validation...")
print(f"   Dataset: X_train_scaled ({total_samples:,} samples, {total_frauds} frauds)")
print(f"   Each fold: ~{samples_per_fold:,} samples (~{frauds_per_fold} real frauds)")
print(f"   (This will take 3-5 minutes due to Random Forest complexity)")

start_time = time.time()

# Perform 5-fold CV with SMOTE
cv_scores_smote = cross_val_score(
    rf_smote_pipeline,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

cv_time_smote = time.time() - start_time

# Display SMOTE results
print("\n" + "-"*70)
print("SMOTE Cross-Validation Results")
print("-"*70)

print(f"\n+ Cross-validation complete in {cv_time_smote:.2f} seconds ({cv_time_smote/60:.1f} minutes)")

print(f"\n Individual Fold Scores (SMOTE):")
for i, score in enumerate(cv_scores_smote, 1):
    print(f"   Fold {i}: AUC = {score:.4f}")

print(f"\n SMOTE Summary Statistics:")
print(f"   Mean CV AUC:  {cv_scores_smote.mean():.4f}")
print(f"   Std CV AUC:   {cv_scores_smote.std():.4f}")
print(f"   Min CV AUC:   {cv_scores_smote.min():.4f}")
print(f"   Max CV AUC:   {cv_scores_smote.max():.4f}")

# Calculate 95% confidence interval
confidence_interval_smote = 1.96 * cv_scores_smote.std()
print(f"\n 95% Confidence Interval:")
print(f"   {cv_scores_smote.mean():.4f} +/- {confidence_interval_smote:.4f}")
print(f"   Range: [{cv_scores_smote.mean() - confidence_interval_smote:.4f}, "
      f"{cv_scores_smote.mean() + confidence_interval_smote:.4f}]")

# ----------------------------------------------------------------------------
# Train Final SMOTE Model
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Training Final SMOTE Model")
print("-"*70)

print("\n Training final model with SMOTE on all training data...")

start_time = time.time()
rf_smote_pipeline.fit(X_train_scaled, y_train)
train_time_smote = time.time() - start_time

print(f"+ SMOTE model training complete in {train_time_smote:.2f} seconds")

# Store SMOTE results
rf_smote_cv_mean = cv_scores_smote.mean()
rf_smote_cv_std = cv_scores_smote.std()

print(f"\n SMOTE Model Summary:")
print(f"   Expected AUC: {rf_smote_cv_mean:.4f} +/- {rf_smote_cv_std:.4f}")
print(f"   Stability: {'Excellent' if rf_smote_cv_std < 0.01 else 'Good' if rf_smote_cv_std < 0.02 else 'Moderate'}")


# ============================================================================
# PART B: RANDOM FOREST WITH ADASYN
# ============================================================================
print("\n" + "="*70)
print("PART B: RANDOM FOREST + ADASYN")
print("="*70)

print("\n About ADASYN:")
print("   - Adaptive Synthetic Sampling")
print("   - Focuses on harder-to-learn minority samples")
print("   - Creates MORE synthetics near decision boundary (harder cases)")
print("   - Creates FEWER synthetics in easy regions")
print("   - Better for complex, varied fraud patterns")

# ----------------------------------------------------------------------------
# Create ADASYN Pipeline
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Creating ADASYN Pipeline")
print("-"*70)

print("\n Building pipeline with 2 steps:")
print("   Step 1: ADASYN (adaptive balance training data)")
print("   Step 2: Random Forest (classification with 100 trees)")

# Create ADASYN pipeline
rf_adasyn_pipeline = ImbPipeline([
    ('adasyn', ADASYN(random_state=42)),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1
    ))
])

print("\n+ ADASYN pipeline created successfully!")

# ----------------------------------------------------------------------------
# Cross-Validation with ADASYN
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running 5-Fold Cross-Validation with ADASYN")
print("-"*70)

print(f"\n Running 5-fold cross-validation...")
print(f"   Dataset: X_train_scaled ({total_samples:,} samples, {total_frauds} frauds)")
print(f"   ADASYN will adaptively generate synthetics based on difficulty")
print(f"   (This will take 3-5 minutes due to Random Forest complexity)")

start_time = time.time()

# Perform 5-fold CV with ADASYN
cv_scores_adasyn = cross_val_score(
    rf_adasyn_pipeline,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

cv_time_adasyn = time.time() - start_time

# Display ADASYN results
print("\n" + "-"*70)
print("ADASYN Cross-Validation Results")
print("-"*70)

print(f"\n+ Cross-validation complete in {cv_time_adasyn:.2f} seconds ({cv_time_adasyn/60:.1f} minutes)")

print(f"\n Individual Fold Scores (ADASYN):")
for i, score in enumerate(cv_scores_adasyn, 1):
    print(f"   Fold {i}: AUC = {score:.4f}")

print(f"\n ADASYN Summary Statistics:")
print(f"   Mean CV AUC:  {cv_scores_adasyn.mean():.4f}")
print(f"   Std CV AUC:   {cv_scores_adasyn.std():.4f}")
print(f"   Min CV AUC:   {cv_scores_adasyn.min():.4f}")
print(f"   Max CV AUC:   {cv_scores_adasyn.max():.4f}")

# Calculate 95% confidence interval
confidence_interval_adasyn = 1.96 * cv_scores_adasyn.std()
print(f"\n 95% Confidence Interval:")
print(f"   {cv_scores_adasyn.mean():.4f} +/- {confidence_interval_adasyn:.4f}")
print(f"   Range: [{cv_scores_adasyn.mean() - confidence_interval_adasyn:.4f}, "
      f"{cv_scores_adasyn.mean() + confidence_interval_adasyn:.4f}]")

# ----------------------------------------------------------------------------
# Train Final ADASYN Model
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Training Final ADASYN Model")
print("-"*70)

print("\n Training final model with ADASYN on all training data...")

start_time = time.time()
rf_adasyn_pipeline.fit(X_train_scaled, y_train)
train_time_adasyn = time.time() - start_time

print(f"+ ADASYN model training complete in {train_time_adasyn:.2f} seconds")

# Store ADASYN results
rf_adasyn_cv_mean = cv_scores_adasyn.mean()
rf_adasyn_cv_std = cv_scores_adasyn.std()

print(f"\n ADASYN Model Summary:")
print(f"   Expected AUC: {rf_adasyn_cv_mean:.4f} +/- {rf_adasyn_cv_std:.4f}")
print(f"   Stability: {'Excellent' if rf_adasyn_cv_std < 0.01 else 'Good' if rf_adasyn_cv_std < 0.02 else 'Moderate'}")


# ============================================================================
# COMPARISON: SMOTE vs ADASYN
# ============================================================================
print("\n" + "="*70)
print("SMOTE vs ADASYN COMPARISON")
print("="*70)

print(f"\n Performance Comparison:")
print(f"   SMOTE  Mean AUC: {rf_smote_cv_mean:.4f} (+/-{rf_smote_cv_std:.4f})")
print(f"   ADASYN Mean AUC: {rf_adasyn_cv_mean:.4f} (+/-{rf_adasyn_cv_std:.4f})")

# Calculate difference
auc_difference = rf_adasyn_cv_mean - rf_smote_cv_mean
print(f"\n   Difference: {abs(auc_difference):.4f} ({'ADASYN better' if auc_difference > 0 else 'SMOTE better'})")

# Determine winner
if abs(auc_difference) < 0.001:
    winner = "TIE - Performance essentially identical"
    recommendation = "Either technique is suitable - choose based on training time"
elif auc_difference > 0:
    winner = "ADASYN WINS"
    recommendation = "ADASYN provides better fraud detection for this dataset"
else:
    winner = "SMOTE WINS"
    recommendation = "SMOTE provides better fraud detection for this dataset"

print(f"\n Winner: {winner}")
print(f"\n Recommendation: {recommendation}")

print(f"\n Training Time Comparison:")
print(f"   SMOTE:  {cv_time_smote:.2f} seconds ({cv_time_smote/60:.1f} minutes)")
print(f"   ADASYN: {cv_time_adasyn:.2f} seconds ({cv_time_adasyn/60:.1f} minutes)")
print(f"   Difference: {abs(cv_time_smote - cv_time_adasyn):.2f} seconds ({'ADASYN slower' if cv_time_adasyn > cv_time_smote else 'SMOTE slower'})")

print(f"\n Stability Comparison:")
print(f"   SMOTE  Std: {rf_smote_cv_std:.4f} ({'More stable' if rf_smote_cv_std < rf_adasyn_cv_std else 'Less stable'})")
print(f"   ADASYN Std: {rf_adasyn_cv_std:.4f} ({'More stable' if rf_adasyn_cv_std < rf_smote_cv_std else 'Less stable'})")

# Statistical significance test
print(f"\n Statistical Significance:")
smote_lower = rf_smote_cv_mean - confidence_interval_smote
smote_upper = rf_smote_cv_mean + confidence_interval_smote
adasyn_lower = rf_adasyn_cv_mean - confidence_interval_adasyn
adasyn_upper = rf_adasyn_cv_mean + confidence_interval_adasyn

if (smote_lower <= adasyn_upper) and (adasyn_lower <= smote_upper):
    print(f"   Confidence intervals overlap")
    print(f"   Difference may not be statistically significant")
    print(f"   Both methods perform similarly on this dataset")
else:
    print(f"   + Confidence intervals don't overlap")
    print(f"   Difference is likely statistically significant")
    print(f"   {winner} is meaningfully better")

# Visualization of comparison
print("\n" + "-"*70)
print("Visual Comparison")
print("-"*70)

print("\n Mean AUC Scores:")
print(f"   SMOTE:  {'█' * int(rf_smote_cv_mean * 50)} {rf_smote_cv_mean:.4f}")
print(f"   ADASYN: {'█' * int(rf_adasyn_cv_mean * 50)} {rf_adasyn_cv_mean:.4f}")

print(f"\n Consistency (Lower is Better):")
print(f"   SMOTE:  {'█' * int(rf_smote_cv_std * 1000)} {rf_smote_cv_std:.4f}")
print(f"   ADASYN: {'█' * int(rf_adasyn_cv_std * 1000)} {rf_adasyn_cv_std:.4f}")


# ============================================================================
# SELECT BEST MODEL FOR FINAL USE
# ============================================================================
print("\n" + "="*70)
print("SELECTING BEST RESAMPLING TECHNIQUE")
print("="*70)

# Choose best based on mean AUC (with tie-breaker on stability)
if abs(auc_difference) < 0.001:
    # Performance is essentially the same, choose more stable
    if rf_smote_cv_std < rf_adasyn_cv_std:
        best_pipeline = rf_smote_pipeline
        best_method = "SMOTE"
        best_cv_mean = rf_smote_cv_mean
        best_cv_std = rf_smote_cv_std
        best_cv_scores = cv_scores_smote
        reason = "Similar performance, but SMOTE is more stable"
    else:
        best_pipeline = rf_adasyn_pipeline
        best_method = "ADASYN"
        best_cv_mean = rf_adasyn_cv_mean
        best_cv_std = rf_adasyn_cv_std
        best_cv_scores = cv_scores_adasyn
        reason = "Similar performance, but ADASYN is more stable"
else:
    # Clear winner based on AUC
    if rf_smote_cv_mean > rf_adasyn_cv_mean:
        best_pipeline = rf_smote_pipeline
        best_method = "SMOTE"
        best_cv_mean = rf_smote_cv_mean
        best_cv_std = rf_smote_cv_std
        best_cv_scores = cv_scores_smote
        reason = f"Higher mean AUC ({rf_smote_cv_mean:.4f} vs {rf_adasyn_cv_mean:.4f})"
    else:
        best_pipeline = rf_adasyn_pipeline
        best_method = "ADASYN"
        best_cv_mean = rf_adasyn_cv_mean
        best_cv_std = rf_adasyn_cv_std
        best_cv_scores = cv_scores_adasyn
        reason = f"Higher mean AUC ({rf_adasyn_cv_mean:.4f} vs {rf_smote_cv_mean:.4f})"

print(f"\n Selected Method: {best_method}")
print(f"   Reason: {reason}")
print(f"   Performance: {best_cv_mean:.4f} +/- {best_cv_std:.4f}")

print(f"\n Storing best model for later steps:")
print(f"   Model: Random Forest + {best_method}")
print(f"   Expected AUC: {best_cv_mean:.4f}")

# Store the best results for comparison in Step 21
rf_final_pipeline = best_pipeline
rf_cv_mean = best_cv_mean
rf_cv_std = best_cv_std
rf_cv_scores = best_cv_scores
rf_best_method = best_method


# ============================================================================
# HYPERPARAMETER TUNING FOR BEST MODEL
# ============================================================================
print("\n" + "="*70)
print("HYPERPARAMETER TUNING - RANDOM FOREST")
print("="*70)

print(f"\n WHAT WE'RE DOING:")
print(f"   - Fine-tuning the best model ({best_method} + Random Forest)")
print(f"   - Testing different hyperparameter combinations")
print(f"   - Using GridSearchCV with cross-validation")
print(f"   - Goal: Improve beyond baseline {best_cv_mean:.4f} AUC")

# ----------------------------------------------------------------------------
# Define Hyperparameter Grid
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Defining Hyperparameter Search Space")
print("-"*70)

print("\n Hyperparameters to tune:")
print("   1. n_estimators (Number of trees)")
print("      - More trees = more voting experts")
print("      - But diminishing returns after certain point")
print("      - Trade-off: accuracy vs training time")
print("\n   2. max_depth (Maximum tree depth)")
print("      - Controls how deep each tree can grow")
print("      - Deeper trees = more complex patterns")
print("      - Trade-off: complexity vs overfitting")
print("\n   3. min_samples_split (Minimum samples to split node)")
print("      - How many samples needed before splitting")
print("      - Higher values = more conservative splitting")
print("      - Trade-off: generalization vs pattern detection")
print("\n   4. min_samples_leaf (Minimum samples per leaf)")
print("      - How many samples must be in final decision")
print("      - Higher values = more evidence per decision")
print("      - Trade-off: generalization vs specificity")

# Define parameter grid
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [20, 30, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

print("\n Parameter Grid:")
print(f"   n_estimators: {param_grid['classifier__n_estimators']}")
print(f"   max_depth: {param_grid['classifier__max_depth']}")
print(f"   min_samples_split: {param_grid['classifier__min_samples_split']}")
print(f"   min_samples_leaf: {param_grid['classifier__min_samples_leaf']}")
print(f"\n   Total combinations: {len(param_grid['classifier__n_estimators']) * len(param_grid['classifier__max_depth']) * len(param_grid['classifier__min_samples_split']) * len(param_grid['classifier__min_samples_leaf'])}")
print(f"   With 5-fold CV: {len(param_grid['classifier__n_estimators']) * len(param_grid['classifier__max_depth']) * len(param_grid['classifier__min_samples_split']) * len(param_grid['classifier__min_samples_leaf']) * 5} model fits")

# ----------------------------------------------------------------------------
# Run Grid Search
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running Grid Search with Cross-Validation")
print("-"*70)

print("\n This will take 15-25 minutes...")
print("   Each combination is tested with 5-fold CV")
print("   Random Forest is computationally intensive")
print("   Progress will be shown below")

# Create GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_final_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

# Run grid search
start_time = time.time()
grid_search.fit(X_train_scaled, y_train)
grid_time = time.time() - start_time

# ----------------------------------------------------------------------------
# Display Results
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Grid Search Results")
print("-"*70)

print(f"\n+ Grid search complete in {grid_time:.2f} seconds ({grid_time/60:.1f} minutes)")

print(f"\n Best Parameters Found:")
for param_name, param_value in grid_search.best_params_.items():
    # Remove 'classifier__' prefix for cleaner display
    clean_name = param_name.replace('classifier__', '')
    print(f"   {clean_name}: {param_value}")

print(f"\n Performance Comparison:")
print(f"   Baseline AUC:  {best_cv_mean:.4f} (+/-{best_cv_std:.4f})")
print(f"   Tuned AUC:     {grid_search.best_score_:.4f} (+/-{grid_search.cv_results_['std_test_score'][grid_search.best_index_]:.4f})")

# Calculate improvement
improvement = grid_search.best_score_ - best_cv_mean
improvement_pct = (improvement / best_cv_mean) * 100

print(f"\n Improvement:")
if improvement > 0:
    print(f"   +{improvement:.4f} AUC ({improvement_pct:+.2f}%)")
    print(f"   + Hyperparameter tuning improved performance!")
elif improvement < -0.001:
    print(f"   {improvement:.4f} AUC ({improvement_pct:.2f}%)")
    print(f"   Tuned model performed slightly worse")
    print(f"   -> Baseline model was already well-optimized")
else:
    print(f"   ~{improvement:.4f} AUC (essentially no change)")
    print(f"   -> Baseline hyperparameters were already optimal")

# ----------------------------------------------------------------------------
# Top 5 Parameter Combinations
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Top 5 Hyperparameter Combinations")
print("-"*70)

# Get results sorted by score
results_df = pd.DataFrame(grid_search.cv_results_)
results_df = results_df.sort_values('rank_test_score')

print("\n Best performing combinations:\n")
for i in range(min(5, len(results_df))):
    row = results_df.iloc[i]
    print(f"   Rank {i+1}:")
    print(f"      n_estimators={row['param_classifier__n_estimators']}, "
          f"max_depth={row['param_classifier__max_depth']}, "
          f"min_samples_split={row['param_classifier__min_samples_split']}, "
          f"min_samples_leaf={row['param_classifier__min_samples_leaf']}")
    print(f"      Mean AUC: {row['mean_test_score']:.4f} (+/-{row['std_test_score']:.4f})")
    print()

# ----------------------------------------------------------------------------
# Store Tuned Model
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Storing Tuned Model")
print("-"*70)

# Store the tuned model
rf_tuned_pipeline = grid_search.best_estimator_
rf_tuned_cv_mean = grid_search.best_score_
rf_tuned_cv_std = grid_search.cv_results_['std_test_score'][grid_search.best_index_]
rf_best_params = grid_search.best_params_

print(f"\n+ Tuned model stored for final evaluation")
print(f"   Best hyperparameters saved")
print(f"   Expected AUC: {rf_tuned_cv_mean:.4f}")

# ----------------------------------------------------------------------------
# Decision: Use Baseline or Tuned?
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Model Selection Decision")
print("-"*70)

# Decide whether to use tuned or baseline model
if improvement > 0.0001:  # Meaningful improvement
    print(f"\n+ Using TUNED model for final evaluation")
    print(f"   Reason: Tuning improved performance by {improvement:.4f}")
    rf_final_model = rf_tuned_pipeline
    rf_final_cv_mean = rf_tuned_cv_mean
    rf_final_cv_std = rf_tuned_cv_std
    model_version = "Tuned"
else:
    print(f"\n+ Using BASELINE model for final evaluation")
    print(f"   Reason: Tuning did not provide meaningful improvement")
    print(f"   Baseline model was already well-optimized")
    rf_final_model = rf_final_pipeline
    rf_final_cv_mean = best_cv_mean
    rf_final_cv_std = best_cv_std
    model_version = "Baseline"

print(f"\n Final Random Forest Model:")
print(f"   Version: {model_version}")
print(f"   Resampling: {best_method}")
print(f"   Expected AUC: {rf_final_cv_mean:.4f} (+/-{rf_final_cv_std:.4f})")


# ============================================================================
# SAVE MODEL AND RESULTS TO DISK
# ============================================================================
print("\n" + "="*70)
print("SAVING MODEL AND RESULTS")
print("="*70)

print("\n Saving to ../models/ directory...")

# Save the final model
model_path = '../models/random_forest_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(rf_final_model, f)
print(f"   + Model saved: {model_path}")

# Save all results and metadata
results_data = {
    'model_name': 'Random Forest',
    'resampling_method': best_method,
    'model_version': model_version,
    'cv_mean': rf_final_cv_mean,
    'cv_std': rf_final_cv_std,
    'cv_scores': rf_cv_scores,
    'best_params': rf_best_params if model_version == "Tuned" else None,
    'training_samples': total_samples,
    'training_frauds': total_frauds,
    'timestamp': pd.Timestamp.now()
}

results_path = '../models/random_forest_results.pkl'
with open(results_path, 'wb') as f:
    pickle.dump(results_data, f)
print(f"   + Results saved: {results_path}")

print(f"\n+ All files saved successfully!")
print(f"   Model can now be loaded in Phase 5 for comparison")


# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("\n" + "="*70)
print("STEP 18 COMPLETE - KEY ACCOMPLISHMENTS")
print("="*70)

print("\n What We Accomplished:")
print("   + Built Random Forest classifier (ensemble of 100 trees)")
print("   + Compared SMOTE vs ADASYN performance")
print("   + Used proper CV methodology (resampling inside folds)")
print("   + Validated on 100% real data (no synthetic contamination)")
print("   + Selected best resampling technique based on performance")
print("   + Performed hyperparameter tuning with GridSearchCV")
print("   + Trained final optimized model on all available training data")
print("   + SAVED model and results to disk")

print(f"\n Final Results:")
print(f"   Best Method: {best_method}")
print(f"   Model Version: {model_version}")
print(f"   Cross-Validation AUC: {rf_final_cv_mean:.4f} (+/-{rf_final_cv_std:.4f})")
print(f"   Model Stability: {'Excellent' if rf_final_cv_std < 0.01 else 'Good' if rf_final_cv_std < 0.02 else 'Moderate'}")

print("\n Saved Files:")
print(f"   - {model_path}")
print(f"   - {results_path}")

print("\n Next Steps:")
print("   -> Step 19: Build XGBoost Model")
print("   -> Step 20: Build Neural Network (Deep Learning)")
print("   -> Step 21: Compare all models and select winner")

print("\n Important Notes:")
print("   - Test set remains UNTOUCHED for final evaluation in Phase 5")
print("   - Model saved and can be loaded anytime (no need to retrain)")
print("   - Can restart PC without losing progress")
print(f"   - {best_method} + {model_version} model ready for Phase 5")

print("\n" + "="*70)


MODEL 2: RANDOM FOREST

 WHAT WE'RE DOING:
   - Building Random Forest classifier (ensemble of 100 decision trees)
   - Testing TWO resampling techniques: SMOTE vs ADASYN
   - Using 5-fold cross-validation for performance estimation
   - Applying resampling INSIDE each CV fold (proper methodology)
   - Validating on REAL data only (no synthetic frauds in validation)
   - SAVING model and results to disk for later use

PART A: RANDOM FOREST + SMOTE

 About Random Forest:
   - Ensemble of 100 decision trees voting together
   - Each tree trained on random subset of data
   - Each tree focuses on different feature combinations
   - Final prediction = majority vote from all trees
   - More powerful than Logistic Regression for complex patterns

 About SMOTE:
   - Synthetic Minority Over-sampling Technique
   - Creates synthetic samples by interpolating between existing frauds
   - Places new samples along line segments between neighbors
   - Good for general class imbalance

-------------

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.6min finished



----------------------------------------------------------------------
SMOTE Cross-Validation Results
----------------------------------------------------------------------

+ Cross-validation complete in 93.79 seconds (1.6 minutes)

 Individual Fold Scores (SMOTE):
   Fold 1: AUC = 0.9448
   Fold 2: AUC = 0.9655
   Fold 3: AUC = 0.9888
   Fold 4: AUC = 0.9675
   Fold 5: AUC = 0.9823

 SMOTE Summary Statistics:
   Mean CV AUC:  0.9698
   Std CV AUC:   0.0153
   Min CV AUC:   0.9448
   Max CV AUC:   0.9888

 95% Confidence Interval:
   0.9698 +/- 0.0299
   Range: [0.9399, 0.9997]

----------------------------------------------------------------------
Training Final SMOTE Model
----------------------------------------------------------------------

 Training final model with SMOTE on all training data...
+ SMOTE model training complete in 26.67 seconds

 SMOTE Model Summary:
   Expected AUC: 0.9698 +/- 0.0153
   Stability: Good

PART B: RANDOM FOREST + ADASYN

 About ADASYN:
   - Adapti

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.7min finished



----------------------------------------------------------------------
ADASYN Cross-Validation Results
----------------------------------------------------------------------

+ Cross-validation complete in 100.59 seconds (1.7 minutes)

 Individual Fold Scores (ADASYN):
   Fold 1: AUC = 0.9330
   Fold 2: AUC = 0.9471
   Fold 3: AUC = 0.9890
   Fold 4: AUC = 0.9670
   Fold 5: AUC = 0.9770

 ADASYN Summary Statistics:
   Mean CV AUC:  0.9626
   Std CV AUC:   0.0202
   Min CV AUC:   0.9330
   Max CV AUC:   0.9890

 95% Confidence Interval:
   0.9626 +/- 0.0396
   Range: [0.9230, 1.0022]

----------------------------------------------------------------------
Training Final ADASYN Model
----------------------------------------------------------------------

 Training final model with ADASYN on all training data...
+ ADASYN model training complete in 28.30 seconds

 ADASYN Model Summary:
   Expected AUC: 0.9626 +/- 0.0202
   Stability: Moderate

SMOTE vs ADASYN COMPARISON

 Performance Compa

In [6]:
# ============================================================================
# STEP 19: BUILD XGBOOST MODEL (WITH SMOTE AND ADASYN)
# ============================================================================

# Import additional required libraries
import pickle
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, ADASYN
import os

print("\n" + "="*70)
print("MODEL 3: XGBOOST (EXTREME GRADIENT BOOSTING)")
print("="*70)

print("\n WHAT WE'RE DOING:")
print("   - Building XGBoost classifier (sequential ensemble of trees)")
print("   - Testing TWO resampling techniques: SMOTE vs ADASYN")
print("   - Using 5-fold cross-validation for performance estimation")
print("   - Applying resampling INSIDE each CV fold (proper methodology)")
print("   - Validating on REAL data only (no synthetic frauds in validation)")
print("   - SAVING model and results to disk for later use")

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# ============================================================================
# UNDERSTANDING XGBOOST
# ============================================================================
print("\n" + "="*70)
print("ABOUT XGBOOST")
print("="*70)

print("\n What is XGBoost?")
print("   - eXtreme Gradient Boosting")
print("   - Builds trees SEQUENTIALLY (one after another)")
print("   - Each new tree learns from previous trees' mistakes")
print("   - Industry-standard algorithm for structured/tabular data")
print("   - Often wins Kaggle competitions and used in production systems")

print("\n How XGBoost Differs from Random Forest:")
print("   Random Forest:")
print("      - Builds all trees at SAME TIME (parallel)")
print("      - Trees are independent (don't talk to each other)")
print("      - Final prediction = VOTE from all trees")
print("      - Good performance, but leaves room for improvement")
print("\n   XGBoost:")
print("      - Builds trees ONE AT A TIME (sequential)")
print("      - Each tree learns from previous mistakes")
print("      - Final prediction = SUM of all tree adjustments")
print("      - Typically achieves better performance")

print("\n Why XGBoost for Fraud Detection?")
print("   + Handles imbalanced data well (our 0.173% fraud rate)")
print("   + Captures complex non-linear patterns")
print("   + Built-in regularization prevents overfitting")
print("   + Computationally efficient (uses CPU parallelization)")
print("   + Industry-proven for fraud detection tasks")

# ============================================================================
# DATA OVERVIEW
# ============================================================================
print("\n" + "="*70)
print("DATA PREPARATION")
print("="*70)

# Calculate dataset info dynamically
total_samples = len(X_train_scaled)
total_frauds = int(y_train.sum())
total_normal = total_samples - total_frauds
fraud_percentage = (total_frauds / total_samples) * 100

print(f"\n Training Data Overview:")
print(f"   Total samples: {total_samples:,}")
print(f"   Normal transactions: {total_normal:,} ({100-fraud_percentage:.3f}%)")
print(f"   Fraud transactions: {total_frauds} ({fraud_percentage:.3f}%)")
print(f"   Class imbalance ratio: {total_normal/total_frauds:.1f}:1")

print("\n Resampling Strategy:")
print("   - Testing both SMOTE and ADASYN")
print("   - Resampling applied within each CV fold")
print("   - Validation always on 100% real data (no synthetic)")


# ============================================================================
# PART A: XGBOOST WITH SMOTE
# ============================================================================
print("\n" + "="*70)
print("PART A: XGBOOST + SMOTE")
print("="*70)

print("\n About XGBoost:")
print("   - Sequential gradient boosting algorithm")
print("   - Each tree corrects mistakes of previous trees")
print("   - Combines predictions through weighted sum")
print("   - More sophisticated than Random Forest")
print("   - Often achieves best performance on tabular data")

print("\n About SMOTE:")
print("   - Synthetic Minority Over-sampling Technique")
print("   - Creates synthetic samples by interpolating between existing frauds")
print("   - Places new samples along line segments between neighbors")
print("   - Good for general class imbalance")

# ----------------------------------------------------------------------------
# Create SMOTE Pipeline
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Creating SMOTE Pipeline")
print("-"*70)

print("\n Building pipeline with 2 steps:")
print("   Step 1: SMOTE (balance training data)")
print("   Step 2: XGBoost (gradient boosting classification)")

# Create SMOTE pipeline
xgb_smote_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        use_label_encoder=False,
        n_jobs=-1,
        verbosity=0
    ))
])

print("\n+ SMOTE pipeline created successfully!")

# ----------------------------------------------------------------------------
# Cross-Validation with SMOTE
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running 5-Fold Cross-Validation with SMOTE")
print("-"*70)

samples_per_fold = total_samples // 5
frauds_per_fold = total_frauds // 5

print(f"\n Running 5-fold cross-validation...")
print(f"   Dataset: X_train_scaled ({total_samples:,} samples, {total_frauds} frauds)")
print(f"   Each fold: ~{samples_per_fold:,} samples (~{frauds_per_fold} real frauds)")
print(f"   (This will take 2-4 minutes)")

start_time = time.time()

# Perform 5-fold CV with SMOTE
cv_scores_smote = cross_val_score(
    xgb_smote_pipeline,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

cv_time_smote = time.time() - start_time

# Display SMOTE results
print("\n" + "-"*70)
print("SMOTE Cross-Validation Results")
print("-"*70)

print(f"\n+ Cross-validation complete in {cv_time_smote:.2f} seconds ({cv_time_smote/60:.1f} minutes)")

print(f"\n Individual Fold Scores (SMOTE):")
for i, score in enumerate(cv_scores_smote, 1):
    print(f"   Fold {i}: AUC = {score:.4f}")

print(f"\n SMOTE Summary Statistics:")
print(f"   Mean CV AUC:  {cv_scores_smote.mean():.4f}")
print(f"   Std CV AUC:   {cv_scores_smote.std():.4f}")
print(f"   Min CV AUC:   {cv_scores_smote.min():.4f}")
print(f"   Max CV AUC:   {cv_scores_smote.max():.4f}")

# Calculate 95% confidence interval
confidence_interval_smote = 1.96 * cv_scores_smote.std()
print(f"\n 95% Confidence Interval:")
print(f"   {cv_scores_smote.mean():.4f} +/- {confidence_interval_smote:.4f}")
print(f"   Range: [{cv_scores_smote.mean() - confidence_interval_smote:.4f}, "
      f"{cv_scores_smote.mean() + confidence_interval_smote:.4f}]")

# ----------------------------------------------------------------------------
# Train Final SMOTE Model
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Training Final SMOTE Model")
print("-"*70)

print("\n Training final model with SMOTE on all training data...")

start_time = time.time()
xgb_smote_pipeline.fit(X_train_scaled, y_train)
train_time_smote = time.time() - start_time

print(f"+ SMOTE model training complete in {train_time_smote:.2f} seconds")

# Store SMOTE results
xgb_smote_cv_mean = cv_scores_smote.mean()
xgb_smote_cv_std = cv_scores_smote.std()

print(f"\n SMOTE Model Summary:")
print(f"   Expected AUC: {xgb_smote_cv_mean:.4f} +/- {xgb_smote_cv_std:.4f}")
print(f"   Stability: {'Excellent' if xgb_smote_cv_std < 0.01 else 'Good' if xgb_smote_cv_std < 0.02 else 'Moderate'}")


# ============================================================================
# PART B: XGBOOST WITH ADASYN
# ============================================================================
print("\n" + "="*70)
print("PART B: XGBOOST + ADASYN")
print("="*70)

print("\n About ADASYN:")
print("   - Adaptive Synthetic Sampling")
print("   - Focuses on harder-to-learn minority samples")
print("   - Creates MORE synthetics near decision boundary (harder cases)")
print("   - Creates FEWER synthetics in easy regions")
print("   - Better for complex, varied fraud patterns")

# ----------------------------------------------------------------------------
# Create ADASYN Pipeline
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Creating ADASYN Pipeline")
print("-"*70)

print("\n Building pipeline with 2 steps:")
print("   Step 1: ADASYN (adaptive balance training data)")
print("   Step 2: XGBoost (gradient boosting classification)")

# Create ADASYN pipeline
xgb_adasyn_pipeline = ImbPipeline([
    ('adasyn', ADASYN(random_state=42)),
    ('classifier', XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        use_label_encoder=False,
        n_jobs=-1,
        verbosity=0
    ))
])

print("\n+ ADASYN pipeline created successfully!")

# ----------------------------------------------------------------------------
# Cross-Validation with ADASYN
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running 5-Fold Cross-Validation with ADASYN")
print("-"*70)

print(f"\n Running 5-fold cross-validation...")
print(f"   Dataset: X_train_scaled ({total_samples:,} samples, {total_frauds} frauds)")
print(f"   ADASYN will adaptively generate synthetics based on difficulty")
print(f"   (This will take 2-4 minutes)")

start_time = time.time()

# Perform 5-fold CV with ADASYN
cv_scores_adasyn = cross_val_score(
    xgb_adasyn_pipeline,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

cv_time_adasyn = time.time() - start_time

# Display ADASYN results
print("\n" + "-"*70)
print("ADASYN Cross-Validation Results")
print("-"*70)

print(f"\n+ Cross-validation complete in {cv_time_adasyn:.2f} seconds ({cv_time_adasyn/60:.1f} minutes)")

print(f"\n Individual Fold Scores (ADASYN):")
for i, score in enumerate(cv_scores_adasyn, 1):
    print(f"   Fold {i}: AUC = {score:.4f}")

print(f"\n ADASYN Summary Statistics:")
print(f"   Mean CV AUC:  {cv_scores_adasyn.mean():.4f}")
print(f"   Std CV AUC:   {cv_scores_adasyn.std():.4f}")
print(f"   Min CV AUC:   {cv_scores_adasyn.min():.4f}")
print(f"   Max CV AUC:   {cv_scores_adasyn.max():.4f}")

# Calculate 95% confidence interval
confidence_interval_adasyn = 1.96 * cv_scores_adasyn.std()
print(f"\n 95% Confidence Interval:")
print(f"   {cv_scores_adasyn.mean():.4f} +/- {confidence_interval_adasyn:.4f}")
print(f"   Range: [{cv_scores_adasyn.mean() - confidence_interval_adasyn:.4f}, "
      f"{cv_scores_adasyn.mean() + confidence_interval_adasyn:.4f}]")

# ----------------------------------------------------------------------------
# Train Final ADASYN Model
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Training Final ADASYN Model")
print("-"*70)

print("\n Training final model with ADASYN on all training data...")

start_time = time.time()
xgb_adasyn_pipeline.fit(X_train_scaled, y_train)
train_time_adasyn = time.time() - start_time

print(f"+ ADASYN model training complete in {train_time_adasyn:.2f} seconds")

# Store ADASYN results
xgb_adasyn_cv_mean = cv_scores_adasyn.mean()
xgb_adasyn_cv_std = cv_scores_adasyn.std()

print(f"\n ADASYN Model Summary:")
print(f"   Expected AUC: {xgb_adasyn_cv_mean:.4f} +/- {xgb_adasyn_cv_std:.4f}")
print(f"   Stability: {'Excellent' if xgb_adasyn_cv_std < 0.01 else 'Good' if xgb_adasyn_cv_std < 0.02 else 'Moderate'}")


# ============================================================================
# COMPARISON: SMOTE vs ADASYN
# ============================================================================
print("\n" + "="*70)
print("SMOTE vs ADASYN COMPARISON")
print("="*70)

print(f"\n Performance Comparison:")
print(f"   SMOTE  Mean AUC: {xgb_smote_cv_mean:.4f} (+/-{xgb_smote_cv_std:.4f})")
print(f"   ADASYN Mean AUC: {xgb_adasyn_cv_mean:.4f} (+/-{xgb_adasyn_cv_std:.4f})")

# Calculate difference
auc_difference = xgb_adasyn_cv_mean - xgb_smote_cv_mean
print(f"\n   Difference: {abs(auc_difference):.4f} ({'ADASYN better' if auc_difference > 0 else 'SMOTE better'})")

# Determine winner
if abs(auc_difference) < 0.001:
    winner = "TIE - Performance essentially identical"
    recommendation = "Either technique is suitable - choose based on training time"
elif auc_difference > 0:
    winner = "ADASYN WINS"
    recommendation = "ADASYN provides better fraud detection for this dataset"
else:
    winner = "SMOTE WINS"
    recommendation = "SMOTE provides better fraud detection for this dataset"

print(f"\n Winner: {winner}")
print(f"\n Recommendation: {recommendation}")

print(f"\n Training Time Comparison:")
print(f"   SMOTE:  {cv_time_smote:.2f} seconds ({cv_time_smote/60:.1f} minutes)")
print(f"   ADASYN: {cv_time_adasyn:.2f} seconds ({cv_time_adasyn/60:.1f} minutes)")
print(f"   Difference: {abs(cv_time_smote - cv_time_adasyn):.2f} seconds ({'ADASYN slower' if cv_time_adasyn > cv_time_smote else 'SMOTE slower'})")

print(f"\n Stability Comparison:")
print(f"   SMOTE  Std: {xgb_smote_cv_std:.4f} ({'More stable' if xgb_smote_cv_std < xgb_adasyn_cv_std else 'Less stable'})")
print(f"   ADASYN Std: {xgb_adasyn_cv_std:.4f} ({'More stable' if xgb_adasyn_cv_std < xgb_smote_cv_std else 'Less stable'})")

# Statistical significance test
print(f"\n Statistical Significance:")
smote_lower = xgb_smote_cv_mean - confidence_interval_smote
smote_upper = xgb_smote_cv_mean + confidence_interval_smote
adasyn_lower = xgb_adasyn_cv_mean - confidence_interval_adasyn
adasyn_upper = xgb_adasyn_cv_mean + confidence_interval_adasyn

if (smote_lower <= adasyn_upper) and (adasyn_lower <= smote_upper):
    print(f"   Confidence intervals overlap")
    print(f"   Difference may not be statistically significant")
    print(f"   Both methods perform similarly on this dataset")
else:
    print(f"   + Confidence intervals don't overlap")
    print(f"   Difference is likely statistically significant")
    print(f"   {winner} is meaningfully better")

# Visualization of comparison
print("\n" + "-"*70)
print("Visual Comparison")
print("-"*70)

print("\n Mean AUC Scores:")
print(f"   SMOTE:  {'█' * int(xgb_smote_cv_mean * 50)} {xgb_smote_cv_mean:.4f}")
print(f"   ADASYN: {'█' * int(xgb_adasyn_cv_mean * 50)} {xgb_adasyn_cv_mean:.4f}")

print(f"\n Consistency (Lower is Better):")
print(f"   SMOTE:  {'█' * int(xgb_smote_cv_std * 1000)} {xgb_smote_cv_std:.4f}")
print(f"   ADASYN: {'█' * int(xgb_adasyn_cv_std * 1000)} {xgb_adasyn_cv_std:.4f}")


# ============================================================================
# SELECT BEST MODEL FOR FINAL USE
# ============================================================================
print("\n" + "="*70)
print("SELECTING BEST RESAMPLING TECHNIQUE")
print("="*70)

# Choose best based on mean AUC (with tie-breaker on stability)
if abs(auc_difference) < 0.001:
    # Performance is essentially the same, choose more stable
    if xgb_smote_cv_std < xgb_adasyn_cv_std:
        best_pipeline = xgb_smote_pipeline
        best_method = "SMOTE"
        best_cv_mean = xgb_smote_cv_mean
        best_cv_std = xgb_smote_cv_std
        best_cv_scores = cv_scores_smote
        reason = "Similar performance, but SMOTE is more stable"
    else:
        best_pipeline = xgb_adasyn_pipeline
        best_method = "ADASYN"
        best_cv_mean = xgb_adasyn_cv_mean
        best_cv_std = xgb_adasyn_cv_std
        best_cv_scores = cv_scores_adasyn
        reason = "Similar performance, but ADASYN is more stable"
else:
    # Clear winner based on AUC
    if xgb_smote_cv_mean > xgb_adasyn_cv_mean:
        best_pipeline = xgb_smote_pipeline
        best_method = "SMOTE"
        best_cv_mean = xgb_smote_cv_mean
        best_cv_std = xgb_smote_cv_std
        best_cv_scores = cv_scores_smote
        reason = f"Higher mean AUC ({xgb_smote_cv_mean:.4f} vs {xgb_adasyn_cv_mean:.4f})"
    else:
        best_pipeline = xgb_adasyn_pipeline
        best_method = "ADASYN"
        best_cv_mean = xgb_adasyn_cv_mean
        best_cv_std = xgb_adasyn_cv_std
        best_cv_scores = cv_scores_adasyn
        reason = f"Higher mean AUC ({xgb_adasyn_cv_mean:.4f} vs {xgb_smote_cv_mean:.4f})"

print(f"\n Selected Method: {best_method}")
print(f"   Reason: {reason}")
print(f"   Performance: {best_cv_mean:.4f} +/- {best_cv_std:.4f}")

print(f"\n Storing best model for later steps:")
print(f"   Model: XGBoost + {best_method}")
print(f"   Expected AUC: {best_cv_mean:.4f}")

# Store the best results for comparison in Step 21
xgb_final_pipeline = best_pipeline
xgb_cv_mean = best_cv_mean
xgb_cv_std = best_cv_std
xgb_cv_scores = best_cv_scores
xgb_best_method = best_method


# ============================================================================
# HYPERPARAMETER TUNING FOR BEST MODEL
# ============================================================================
print("\n" + "="*70)
print("HYPERPARAMETER TUNING - XGBOOST")
print("="*70)

print(f"\n WHAT WE'RE DOING:")
print(f"   - Fine-tuning the best model ({best_method} + XGBoost)")
print(f"   - Testing different hyperparameter combinations")
print(f"   - Using GridSearchCV with cross-validation")
print(f"   - Goal: Improve beyond baseline {best_cv_mean:.4f} AUC")

# ----------------------------------------------------------------------------
# Define Hyperparameter Grid
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Defining Hyperparameter Search Space")
print("-"*70)

print("\n Hyperparameters to tune:")
print("   1. n_estimators (Number of boosting rounds)")
print("      - Number of sequential trees to build")
print("      - More trees = more learning iterations")
print("      - Trade-off: accuracy vs training time")
print("\n   2. max_depth (Maximum tree depth)")
print("      - Controls how deep each tree can grow")
print("      - Deeper trees = more complex patterns")
print("      - Trade-off: complexity vs overfitting")
print("\n   3. learning_rate (Step size for each tree)")
print("      - How much each tree contributes to final prediction")
print("      - Smaller rate = more conservative learning")
print("      - Trade-off: accuracy vs training time")
print("\n   4. subsample (Fraction of samples per tree)")
print("      - Percentage of data used to train each tree")
print("      - Lower values = more randomness, less overfitting")
print("      - Trade-off: generalization vs training stability")

# Define parameter grid
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__subsample': [0.8, 1.0]
}

print("\n Parameter Grid:")
print(f"   n_estimators: {param_grid['classifier__n_estimators']}")
print(f"   max_depth: {param_grid['classifier__max_depth']}")
print(f"   learning_rate: {param_grid['classifier__learning_rate']}")
print(f"   subsample: {param_grid['classifier__subsample']}")
print(f"\n   Total combinations: {len(param_grid['classifier__n_estimators']) * len(param_grid['classifier__max_depth']) * len(param_grid['classifier__learning_rate']) * len(param_grid['classifier__subsample'])}")
print(f"   With 5-fold CV: {len(param_grid['classifier__n_estimators']) * len(param_grid['classifier__max_depth']) * len(param_grid['classifier__learning_rate']) * len(param_grid['classifier__subsample']) * 5} model fits")

# ----------------------------------------------------------------------------
# Run Grid Search
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running Grid Search with Cross-Validation")
print("-"*70)

print("\n This will take 10-15 minutes...")
print("   Each combination is tested with 5-fold CV")
print("   XGBoost is computationally intensive")
print("   Progress will be shown below")

# Create GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_final_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

# Run grid search
start_time = time.time()
grid_search.fit(X_train_scaled, y_train)
grid_time = time.time() - start_time

# ----------------------------------------------------------------------------
# Display Results
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Grid Search Results")
print("-"*70)

print(f"\n+ Grid search complete in {grid_time:.2f} seconds ({grid_time/60:.1f} minutes)")

print(f"\n Best Parameters Found:")
for param_name, param_value in grid_search.best_params_.items():
    # Remove 'classifier__' prefix for cleaner display
    clean_name = param_name.replace('classifier__', '')
    print(f"   {clean_name}: {param_value}")

print(f"\n Performance Comparison:")
print(f"   Baseline AUC:  {best_cv_mean:.4f} (+/-{best_cv_std:.4f})")
print(f"   Tuned AUC:     {grid_search.best_score_:.4f} (+/-{grid_search.cv_results_['std_test_score'][grid_search.best_index_]:.4f})")

# Calculate improvement
improvement = grid_search.best_score_ - best_cv_mean
improvement_pct = (improvement / best_cv_mean) * 100

print(f"\n Improvement:")
if improvement > 0:
    print(f"   +{improvement:.4f} AUC ({improvement_pct:+.2f}%)")
    print(f"   + Hyperparameter tuning improved performance!")
elif improvement < -0.001:
    print(f"   {improvement:.4f} AUC ({improvement_pct:.2f}%)")
    print(f"   Tuned model performed slightly worse")
    print(f"   -> Baseline model was already well-optimized")
else:
    print(f"   ~{improvement:.4f} AUC (essentially no change)")
    print(f"   -> Baseline hyperparameters were already optimal")

# ----------------------------------------------------------------------------
# Top 5 Parameter Combinations
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Top 5 Hyperparameter Combinations")
print("-"*70)

# Get results sorted by score
results_df = pd.DataFrame(grid_search.cv_results_)
results_df = results_df.sort_values('rank_test_score')

print("\n Best performing combinations:\n")
for i in range(min(5, len(results_df))):
    row = results_df.iloc[i]
    print(f"   Rank {i+1}:")
    print(f"      n_estimators={row['param_classifier__n_estimators']}, "
          f"max_depth={row['param_classifier__max_depth']}, "
          f"learning_rate={row['param_classifier__learning_rate']}, "
          f"subsample={row['param_classifier__subsample']}")
    print(f"      Mean AUC: {row['mean_test_score']:.4f} (+/-{row['std_test_score']:.4f})")
    print()

# ----------------------------------------------------------------------------
# Store Tuned Model
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Storing Tuned Model")
print("-"*70)

# Store the tuned model
xgb_tuned_pipeline = grid_search.best_estimator_
xgb_tuned_cv_mean = grid_search.best_score_
xgb_tuned_cv_std = grid_search.cv_results_['std_test_score'][grid_search.best_index_]
xgb_best_params = grid_search.best_params_

print(f"\n+ Tuned model stored for final evaluation")
print(f"   Best hyperparameters saved")
print(f"   Expected AUC: {xgb_tuned_cv_mean:.4f}")

# ----------------------------------------------------------------------------
# Decision: Use Baseline or Tuned?
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Model Selection Decision")
print("-"*70)

# Decide whether to use tuned or baseline model
if improvement > 0.0001:  # Meaningful improvement
    print(f"\n+ Using TUNED model for final evaluation")
    print(f"   Reason: Tuning improved performance by {improvement:.4f}")
    xgb_final_model = xgb_tuned_pipeline
    xgb_final_cv_mean = xgb_tuned_cv_mean
    xgb_final_cv_std = xgb_tuned_cv_std
    model_version = "Tuned"
else:
    print(f"\n+ Using BASELINE model for final evaluation")
    print(f"   Reason: Tuning did not provide meaningful improvement")
    print(f"   Baseline model was already well-optimized")
    xgb_final_model = xgb_final_pipeline
    xgb_final_cv_mean = best_cv_mean
    xgb_final_cv_std = best_cv_std
    model_version = "Baseline"

print(f"\n Final XGBoost Model:")
print(f"   Version: {model_version}")
print(f"   Resampling: {best_method}")
print(f"   Expected AUC: {xgb_final_cv_mean:.4f} (+/-{xgb_final_cv_std:.4f})")


# ============================================================================
# SAVE MODEL AND RESULTS TO DISK
# ============================================================================
print("\n" + "="*70)
print("SAVING MODEL AND RESULTS")
print("="*70)

print("\n Saving to ../models/ directory...")

# Save the final model
model_path = '../models/xgboost_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(xgb_final_model, f)
print(f"   + Model saved: {model_path}")

# Save all results and metadata
results_data = {
    'model_name': 'XGBoost',
    'resampling_method': best_method,
    'model_version': model_version,
    'cv_mean': xgb_final_cv_mean,
    'cv_std': xgb_final_cv_std,
    'cv_scores': xgb_cv_scores,
    'best_params': xgb_best_params if model_version == "Tuned" else None,
    'training_samples': total_samples,
    'training_frauds': total_frauds,
    'timestamp': pd.Timestamp.now()
}

results_path = '../models/xgboost_results.pkl'
with open(results_path, 'wb') as f:
    pickle.dump(results_data, f)
print(f"   + Results saved: {results_path}")

print(f"\n+ All files saved successfully!")
print(f"   Model can now be loaded in Phase 5 for comparison")


# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("\n" + "="*70)
print("STEP 19 COMPLETE - KEY ACCOMPLISHMENTS")
print("="*70)

print("\n What We Accomplished:")
print("   + Built XGBoost classifier (gradient boosting)")
print("   + Compared SMOTE vs ADASYN performance")
print("   + Used proper CV methodology (resampling inside folds)")
print("   + Validated on 100% real data (no synthetic contamination)")
print("   + Selected best resampling technique based on performance")
print("   + Performed hyperparameter tuning with GridSearchCV")
print("   + Trained final optimized model on all available training data")
print("   + SAVED model and results to disk")

print(f"\n Final Results:")
print(f"   Best Method: {best_method}")
print(f"   Model Version: {model_version}")
print(f"   Cross-Validation AUC: {xgb_final_cv_mean:.4f} (+/-{xgb_final_cv_std:.4f})")
print(f"   Model Stability: {'Excellent' if xgb_final_cv_std < 0.01 else 'Good' if xgb_final_cv_std < 0.02 else 'Moderate'}")

print("\n Saved Files:")
print(f"   - {model_path}")
print(f"   - {results_path}")

print("\n Next Steps:")
print("   -> Step 20: Build Neural Network (Deep Learning with GPU)")
print("   -> Step 21: Compare all models and select winner")

print("\n Important Notes:")
print("   - Test set remains UNTOUCHED for final evaluation in Phase 5")
print("   - Model saved and can be loaded anytime (no need to retrain)")
print("   - Can restart PC without losing progress")
print(f"   - {best_method} + {model_version} model ready for Phase 5")
print(f"   - All models trained with consistent methodology for fair comparison")

print("\n" + "="*70)


MODEL 3: XGBOOST (EXTREME GRADIENT BOOSTING)

 WHAT WE'RE DOING:
   - Building XGBoost classifier (sequential ensemble of trees)
   - Testing TWO resampling techniques: SMOTE vs ADASYN
   - Using 5-fold cross-validation for performance estimation
   - Applying resampling INSIDE each CV fold (proper methodology)
   - Validating on REAL data only (no synthetic frauds in validation)
   - SAVING model and results to disk for later use

ABOUT XGBOOST

 What is XGBoost?
   - eXtreme Gradient Boosting
   - Builds trees SEQUENTIALLY (one after another)
   - Each new tree learns from previous trees' mistakes
   - Industry-standard algorithm for structured/tabular data
   - Often wins Kaggle competitions and used in production systems

 How XGBoost Differs from Random Forest:
   Random Forest:
      - Builds all trees at SAME TIME (parallel)
      - Trees are independent (don't talk to each other)
      - Final prediction = VOTE from all trees
      - Good performance, but leaves room for impro

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.5s finished



----------------------------------------------------------------------
SMOTE Cross-Validation Results
----------------------------------------------------------------------

+ Cross-validation complete in 6.70 seconds (0.1 minutes)

 Individual Fold Scores (SMOTE):
   Fold 1: AUC = 0.9548
   Fold 2: AUC = 0.9724
   Fold 3: AUC = 0.9961
   Fold 4: AUC = 0.9760
   Fold 5: AUC = 0.9797

 SMOTE Summary Statistics:
   Mean CV AUC:  0.9758
   Std CV AUC:   0.0133
   Min CV AUC:   0.9548
   Max CV AUC:   0.9961

 95% Confidence Interval:
   0.9758 +/- 0.0260
   Range: [0.9498, 1.0018]

----------------------------------------------------------------------
Training Final SMOTE Model
----------------------------------------------------------------------

 Training final model with SMOTE on all training data...
+ SMOTE model training complete in 1.13 seconds

 SMOTE Model Summary:
   Expected AUC: 0.9758 +/- 0.0133
   Stability: Good

PART B: XGBOOST + ADASYN

 About ADASYN:
   - Adaptive Synth

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.6s finished



----------------------------------------------------------------------
ADASYN Cross-Validation Results
----------------------------------------------------------------------

+ Cross-validation complete in 5.87 seconds (0.1 minutes)

 Individual Fold Scores (ADASYN):
   Fold 1: AUC = 0.9489
   Fold 2: AUC = 0.9688
   Fold 3: AUC = 0.9962
   Fold 4: AUC = 0.9749
   Fold 5: AUC = 0.9841

 ADASYN Summary Statistics:
   Mean CV AUC:  0.9746
   Std CV AUC:   0.0158
   Min CV AUC:   0.9489
   Max CV AUC:   0.9962

 95% Confidence Interval:
   0.9746 +/- 0.0310
   Range: [0.9436, 1.0056]

----------------------------------------------------------------------
Training Final ADASYN Model
----------------------------------------------------------------------

 Training final model with ADASYN on all training data...
+ ADASYN model training complete in 0.98 seconds

 ADASYN Model Summary:
   Expected AUC: 0.9746 +/- 0.0158
   Stability: Good

SMOTE vs ADASYN COMPARISON

 Performance Comparison:


In [8]:
# ============================================================================
# STEP 20: BUILD NEURAL NETWORK (DEEP LEARNING) WITH SMOTE AND ADASYN
# ============================================================================

# Import required libraries for Neural Network and saving
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE, ADASYN
import os
import numpy as np

print("\n" + "="*70)
print("MODEL 4: NEURAL NETWORK (DEEP LEARNING)")
print("="*70)

print("\n WHAT WE'RE DOING:")
print("   - Building Neural Network classifier (deep learning)")
print("   - Testing TWO resampling techniques: SMOTE vs ADASYN")
print("   - Using manual 5-fold cross-validation (PyTorch requirement)")
print("   - Applying resampling INSIDE each CV fold (proper methodology)")
print("   - Validating on REAL data only (no synthetic frauds in validation)")
print("   - Leveraging GPU acceleration for faster training")
print("   - SAVING model and results to disk for later use")
print("   - TRACKING training history for visualization")

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# ============================================================================
# UNDERSTANDING NEURAL NETWORKS
# ============================================================================
print("\n" + "="*70)
print("ABOUT NEURAL NETWORKS")
print("="*70)

print("\n What is a Neural Network?")
print("   - Deep learning model inspired by human brain")
print("   - Multiple layers of interconnected 'neurons'")
print("   - Each layer learns increasingly complex patterns")
print("   - Can discover non-linear relationships in data")
print("   - State-of-the-art for many complex pattern recognition tasks")

print("\n How Neural Networks Differ from Other Models:")
print("   Logistic Regression:")
print("      - Linear decision boundary (straight line)")
print("      - Fast and interpretable")
print("      - Limited to simple patterns")
print("\n   Random Forest:")
print("      - Parallel decision trees voting")
print("      - Good for tabular data")
print("      - Non-linear but limited complexity")
print("\n   XGBoost:")
print("      - Sequential trees learning from mistakes")
print("      - Excellent for structured data")
print("      - Non-linear with regularization")
print("\n   Neural Network:")
print("      - Multiple layers transforming data")
print("      - Can learn VERY complex patterns")
print("      - Requires more data and computational power")
print("      - Most flexible but hardest to interpret")

print("\n Why Neural Networks for Fraud Detection?")
print("   + Large dataset (284K+ transactions) - NN needs lots of data")
print("   + Complex fraud patterns that may be non-obvious")
print("   + Can learn hierarchical features automatically")
print("   + GPU acceleration available (your RTX 5070 Ti)")
print("   + Industry-standard for advanced fraud detection")
print("   + Shows portfolio depth beyond traditional ML")

# ============================================================================
# GPU CONFIGURATION CHECK
# ============================================================================
print("\n" + "="*70)
print("GPU CONFIGURATION")
print("="*70)

# Check GPU availability and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n GPU Status:")
if torch.cuda.is_available():
    print(f"   + GPU Available: YES")
    print(f"   GPU Device: {torch.cuda.get_device_name(0)}")
    print(f"   Training will be ACCELERATED!")
    print(f"   RTX 5070 Ti will significantly speed up training")
else:
    print(f"   - GPU Available: NO")
    print(f"   Training will run on CPU (slower but functional)")
    print(f"   Consider checking GPU drivers and CUDA installation")

# ============================================================================
# DATA OVERVIEW
# ============================================================================
print("\n" + "="*70)
print("DATA PREPARATION")
print("="*70)

# Calculate dataset info dynamically
total_samples = len(X_train_scaled)
total_frauds = int(y_train.sum())
total_normal = total_samples - total_frauds
fraud_percentage = (total_frauds / total_samples) * 100

print(f"\n Training Data Overview:")
print(f"   Total samples: {total_samples:,}")
print(f"   Normal transactions: {total_normal:,} ({100-fraud_percentage:.3f}%)")
print(f"   Fraud transactions: {total_frauds} ({fraud_percentage:.3f}%)")
print(f"   Class imbalance ratio: {total_normal/total_frauds:.1f}:1")

print("\n Resampling Strategy:")
print("   - Testing both SMOTE and ADASYN")
print("   - Resampling applied within each CV fold")
print("   - Validation always on 100% real data (no synthetic)")

# ============================================================================
# NEURAL NETWORK ARCHITECTURE
# ============================================================================
print("\n" + "="*70)
print("NEURAL NETWORK ARCHITECTURE")
print("="*70)

print("\n Network Structure:")
print("   Layer 1 (Input):  128 neurons + Dropout(0.3) + BatchNorm")
print("   Layer 2 (Hidden): 64 neurons  + Dropout(0.3) + BatchNorm")
print("   Layer 3 (Hidden): 32 neurons  + Dropout(0.2)")
print("   Layer 4 (Output): 1 neuron    (fraud probability)")
print("\n   Total: 4 layers (deep network)")
print("   Activation: ReLU (hidden layers), Sigmoid (output)")
print("   Optimizer: Adam (adaptive learning rate)")
print("   Loss: Binary Cross Entropy (for binary classification)")

class FraudDetectionNN(nn.Module):
    """
    4-layer Neural Network for fraud detection using PyTorch
    
    Architecture:
    - Input Layer: hidden_units neurons with dropout and batch normalization
    - Hidden Layer 1: hidden_units/2 neurons with dropout and batch normalization
    - Hidden Layer 2: hidden_units/4 neurons with dropout
    - Output Layer: 1 neuron with sigmoid activation
    
    Parameters:
    -----------
    input_dim : int
        Number of input features (30 in our case)
    hidden_units : int
        Number of neurons in first hidden layer (default 128)
    dropout : float
        Dropout rate for regularization (default 0.3)
    """
    def __init__(self, input_dim, hidden_units=128, dropout=0.3):
        super(FraudDetectionNN, self).__init__()
        
        # Calculate layer sizes
        layer2_units = hidden_units // 2
        layer3_units = hidden_units // 4
        
        # Layer 1: Learn basic patterns
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.bn1 = nn.BatchNorm1d(hidden_units)
        self.dropout1 = nn.Dropout(dropout)
        
        # Layer 2: Combine patterns
        self.fc2 = nn.Linear(hidden_units, layer2_units)
        self.bn2 = nn.BatchNorm1d(layer2_units)
        self.dropout2 = nn.Dropout(dropout)
        
        # Layer 3: Refine patterns
        self.fc3 = nn.Linear(layer2_units, layer3_units)
        self.dropout3 = nn.Dropout(dropout * 0.67)
        
        # Layer 4: Final decision
        self.fc4 = nn.Linear(layer3_units, 1)
        
    def forward(self, x):
        """Forward pass through the network"""
        # Layer 1
        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout1(x)
        
        # Layer 2
        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        
        # Layer 3
        x = self.fc3(x)
        x = torch.relu(x)
        x = self.dropout3(x)
        
        # Layer 4
        x = self.fc4(x)
        x = torch.sigmoid(x)
        
        return x

def train_neural_network(model, train_loader, val_loader, epochs=20, patience=5, learning_rate=0.001, device='cpu', track_history=False):
    """
    Train neural network with early stopping
    
    Parameters:
    -----------
    model : FraudDetectionNN
        The neural network model
    train_loader : DataLoader
        Training data loader
    val_loader : DataLoader
        Validation data loader
    epochs : int
        Maximum number of epochs
    patience : int
        Early stopping patience
    learning_rate : float
        Learning rate for optimizer
    device : str
        Device to train on ('cuda' or 'cpu')
    track_history : bool
        Whether to track training history (loss, AUC per epoch)
    
    Returns:
    --------
    model : FraudDetectionNN
        Trained model
    best_val_auc : float
        Best validation AUC achieved
    stopped_epoch : int
        Epoch where training stopped
    history : dict (if track_history=True)
        Training history with epoch, train_loss, val_auc
    """
    # Loss function and optimizer
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Early stopping variables
    best_val_auc = 0
    patience_counter = 0
    best_model_state = None
    stopped_epoch = 0
    
    # Training history tracking
    history = {
        'epoch': [],
        'train_loss': [],
        'val_auc': []
    } if track_history else None
    
    # Training loop
    for epoch in range(epochs):
        # Training phase
        model.train()
        epoch_loss = 0.0
        batch_count = 0
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            # Forward pass
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            if track_history:
                epoch_loss += loss.item()
                batch_count += 1
        
        # Validation phase
        model.eval()
        val_predictions = []
        val_labels = []
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                outputs = model(X_batch)
                val_predictions.extend(outputs.cpu().numpy())
                val_labels.extend(y_batch.numpy())
        
        # Calculate validation AUC
        val_auc = roc_auc_score(val_labels, val_predictions)
        
        # Track history if requested
        if track_history:
            history['epoch'].append(epoch + 1)
            history['train_loss'].append(epoch_loss / batch_count)
            history['val_auc'].append(val_auc)
        
        # Early stopping check
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            patience_counter = 0
            best_model_state = model.state_dict().copy()
            stopped_epoch = epoch + 1
        else:
            patience_counter += 1
            
        if patience_counter >= patience:
            # Restore best weights
            model.load_state_dict(best_model_state)
            stopped_epoch = epoch + 1 - patience
            break
    
    if track_history:
        return model, best_val_auc, stopped_epoch, history
    else:
        return model, best_val_auc, stopped_epoch

print("\n+ Neural network architecture defined")

# ============================================================================
# PART A: NEURAL NETWORK WITH SMOTE
# ============================================================================
print("\n" + "="*70)
print("PART A: NEURAL NETWORK + SMOTE")
print("="*70)

print("\n About SMOTE:")
print("   - Synthetic Minority Over-sampling Technique")
print("   - Creates synthetic samples by interpolating between existing frauds")
print("   - Places new samples along line segments between neighbors")
print("   - Good for general class imbalance")

# ----------------------------------------------------------------------------
# Manual K-Fold Cross-Validation with SMOTE
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running 5-Fold Cross-Validation with SMOTE")
print("-"*70)

print(f"\n Running manual 5-fold cross-validation...")
print(f"   Why manual? Neural networks need to be rebuilt for each fold")
print(f"   sklearn's cross_val_score doesn't work with PyTorch")
print(f"   Dataset: X_train_scaled ({total_samples:,} samples, {total_frauds} frauds)")
print(f"   Each fold: ~{total_samples // 5:,} samples (~{total_frauds // 5} real frauds)")
print(f"   This will take 10-15 minutes (depending on GPU)")

# Initialize K-Fold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_smote = []

start_time = time.time()

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_scaled), 1):
    print(f"\n    Training fold {fold}/5...")
    
    # Split data for this fold
    X_fold_train = X_train_scaled.iloc[train_idx] if isinstance(X_train_scaled, pd.DataFrame) else X_train_scaled[train_idx]
    X_fold_val = X_train_scaled.iloc[val_idx] if isinstance(X_train_scaled, pd.DataFrame) else X_train_scaled[val_idx]
    y_fold_train = y_train.iloc[train_idx] if isinstance(y_train, pd.Series) else y_train[train_idx]
    y_fold_val = y_train.iloc[val_idx] if isinstance(y_train, pd.Series) else y_train[val_idx]
    
    # Apply SMOTE to training fold ONLY (not validation)
    smote = SMOTE(random_state=42)
    X_fold_train_resampled, y_fold_train_resampled = smote.fit_resample(X_fold_train, y_fold_train)
    
    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_fold_train_resampled.values if isinstance(X_fold_train_resampled, pd.DataFrame) else X_fold_train_resampled)
    y_train_tensor = torch.FloatTensor(y_fold_train_resampled.values if isinstance(y_fold_train_resampled, pd.Series) else y_fold_train_resampled).reshape(-1, 1)
    X_val_tensor = torch.FloatTensor(X_fold_val.values if isinstance(X_fold_val, pd.DataFrame) else X_fold_val)
    y_val_tensor = torch.FloatTensor(y_fold_val.values if isinstance(y_fold_val, pd.Series) else y_fold_val).reshape(-1, 1)
    
    # Create data loaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    
    # Create fresh model for this fold
    model = FraudDetectionNN(X_train_scaled.shape[1], hidden_units=128, dropout=0.3).to(device)
    
    # Train model (no history tracking for CV folds)
    model, fold_auc, stopped_epoch = train_neural_network(
        model, train_loader, val_loader, 
        epochs=20, patience=5, learning_rate=0.001, device=device, track_history=False
    )
    
    cv_scores_smote.append(fold_auc)
    
    print(f"      Fold {fold} AUC: {fold_auc:.4f}")
    print(f"      Stopped at epoch: {stopped_epoch}")

cv_time_smote = time.time() - start_time
cv_scores_smote = np.array(cv_scores_smote)

# Display SMOTE Cross-Validation Results
print("\n" + "-"*70)
print("SMOTE Cross-Validation Results")
print("-"*70)

print(f"\n+ Cross-validation complete in {cv_time_smote:.2f} seconds ({cv_time_smote/60:.1f} minutes)")

print(f"\n Individual Fold Scores (SMOTE):")
for i, score in enumerate(cv_scores_smote, 1):
    print(f"   Fold {i}: AUC = {score:.4f}")

print(f"\n SMOTE Summary Statistics:")
print(f"   Mean CV AUC:  {cv_scores_smote.mean():.4f}")
print(f"   Std CV AUC:   {cv_scores_smote.std():.4f}")
print(f"   Min CV AUC:   {cv_scores_smote.min():.4f}")
print(f"   Max CV AUC:   {cv_scores_smote.max():.4f}")

# Calculate 95% confidence interval
confidence_interval_smote = 1.96 * cv_scores_smote.std()
print(f"\n 95% Confidence Interval:")
print(f"   {cv_scores_smote.mean():.4f} +/- {confidence_interval_smote:.4f}")
print(f"   Range: [{cv_scores_smote.mean() - confidence_interval_smote:.4f}, "
      f"{cv_scores_smote.mean() + confidence_interval_smote:.4f}]")

# ----------------------------------------------------------------------------
# Train Final SMOTE Model WITH HISTORY TRACKING
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Training Final SMOTE Model")
print("-"*70)

print("\n Training final neural network with SMOTE on all training data...")
print("   This model will be used for final evaluation")
print("   Training for up to 50 epochs with early stopping")
print("   Tracking training history for visualization")
print("   This will take 3-5 minutes")

# Apply SMOTE to entire training set
smote_full = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote_full.fit_resample(X_train_scaled, y_train)

# Convert to PyTorch tensors
X_train_smote_tensor = torch.FloatTensor(X_train_smote.values if isinstance(X_train_smote, pd.DataFrame) else X_train_smote)
y_train_smote_tensor = torch.FloatTensor(y_train_smote.values if isinstance(y_train_smote, pd.Series) else y_train_smote).reshape(-1, 1)

# Create dataset and split for validation
train_dataset_full = TensorDataset(X_train_smote_tensor, y_train_smote_tensor)

# Split into train and validation (80/20)
train_size = int(0.8 * len(train_dataset_full))
val_size = len(train_dataset_full) - train_size
train_dataset_split, val_dataset_split = torch.utils.data.random_split(
    train_dataset_full, [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

train_loader_full = DataLoader(train_dataset_split, batch_size=32, shuffle=True)
val_loader_full = DataLoader(val_dataset_split, batch_size=32, shuffle=False)

start_time = time.time()

# Create and train final model WITH HISTORY TRACKING
nn_smote_model = FraudDetectionNN(X_train_scaled.shape[1], hidden_units=128, dropout=0.3).to(device)

nn_smote_model, final_auc, stopped_epoch, smote_training_history = train_neural_network(
    nn_smote_model, train_loader_full, val_loader_full,
    epochs=50, patience=10, learning_rate=0.001, device=device, track_history=True
)

train_time_smote = time.time() - start_time

print(f"\n+ SMOTE model training complete in {train_time_smote:.2f} seconds ({train_time_smote/60:.1f} minutes)")
print(f"   Stopped at epoch: {stopped_epoch}")
print(f"   Training history captured: {len(smote_training_history['epoch'])} epochs")

# Store SMOTE results
nn_smote_cv_mean = cv_scores_smote.mean()
nn_smote_cv_std = cv_scores_smote.std()

print(f"\n SMOTE Model Summary:")
print(f"   Expected AUC: {nn_smote_cv_mean:.4f} +/- {nn_smote_cv_std:.4f}")
print(f"   Stability: {'Excellent' if nn_smote_cv_std < 0.01 else 'Good' if nn_smote_cv_std < 0.02 else 'Moderate'}")


# ============================================================================
# PART B: NEURAL NETWORK WITH ADASYN
# ============================================================================
print("\n" + "="*70)
print("PART B: NEURAL NETWORK + ADASYN")
print("="*70)

print("\n About ADASYN:")
print("   - Adaptive Synthetic Sampling")
print("   - Focuses on harder-to-learn minority samples")
print("   - Creates MORE synthetics near decision boundary (harder cases)")
print("   - Creates FEWER synthetics in easy regions")
print("   - Better for complex, varied fraud patterns")

# ----------------------------------------------------------------------------
# Manual K-Fold Cross-Validation with ADASYN
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running 5-Fold Cross-Validation with ADASYN")
print("-"*70)

print(f"\n Running manual 5-fold cross-validation...")
print(f"   Dataset: X_train_scaled ({total_samples:,} samples, {total_frauds} frauds)")
print(f"   ADASYN will adaptively generate synthetics based on difficulty")
print(f"   This will take 10-15 minutes (depending on GPU)")

# Initialize K-Fold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_adasyn = []

start_time = time.time()

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_scaled), 1):
    print(f"\n    Training fold {fold}/5...")
    
    # Split data for this fold
    X_fold_train = X_train_scaled.iloc[train_idx] if isinstance(X_train_scaled, pd.DataFrame) else X_train_scaled[train_idx]
    X_fold_val = X_train_scaled.iloc[val_idx] if isinstance(X_train_scaled, pd.DataFrame) else X_train_scaled[val_idx]
    y_fold_train = y_train.iloc[train_idx] if isinstance(y_train, pd.Series) else y_train[train_idx]
    y_fold_val = y_train.iloc[val_idx] if isinstance(y_train, pd.Series) else y_train[val_idx]
    
    # Apply ADASYN to training fold ONLY (not validation)
    adasyn = ADASYN(random_state=42)
    X_fold_train_resampled, y_fold_train_resampled = adasyn.fit_resample(X_fold_train, y_fold_train)
    
    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_fold_train_resampled.values if isinstance(X_fold_train_resampled, pd.DataFrame) else X_fold_train_resampled)
    y_train_tensor = torch.FloatTensor(y_fold_train_resampled.values if isinstance(y_fold_train_resampled, pd.Series) else y_fold_train_resampled).reshape(-1, 1)
    X_val_tensor = torch.FloatTensor(X_fold_val.values if isinstance(X_fold_val, pd.DataFrame) else X_fold_val)
    y_val_tensor = torch.FloatTensor(y_fold_val.values if isinstance(y_fold_val, pd.Series) else y_fold_val).reshape(-1, 1)
    
    # Create data loaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    
    # Create fresh model for this fold
    model = FraudDetectionNN(X_train_scaled.shape[1], hidden_units=128, dropout=0.3).to(device)
    
    # Train model
    model, fold_auc, stopped_epoch = train_neural_network(
        model, train_loader, val_loader,
        epochs=20, patience=5, learning_rate=0.001, device=device, track_history=False
    )
    
    cv_scores_adasyn.append(fold_auc)
    
    print(f"      Fold {fold} AUC: {fold_auc:.4f}")
    print(f"      Stopped at epoch: {stopped_epoch}")

cv_time_adasyn = time.time() - start_time
cv_scores_adasyn = np.array(cv_scores_adasyn)

# Display ADASYN Cross-Validation Results
print("\n" + "-"*70)
print("ADASYN Cross-Validation Results")
print("-"*70)

print(f"\n+ Cross-validation complete in {cv_time_adasyn:.2f} seconds ({cv_time_adasyn/60:.1f} minutes)")

print(f"\n Individual Fold Scores (ADASYN):")
for i, score in enumerate(cv_scores_adasyn, 1):
    print(f"   Fold {i}: AUC = {score:.4f}")

print(f"\n ADASYN Summary Statistics:")
print(f"   Mean CV AUC:  {cv_scores_adasyn.mean():.4f}")
print(f"   Std CV AUC:   {cv_scores_adasyn.std():.4f}")
print(f"   Min CV AUC:   {cv_scores_adasyn.min():.4f}")
print(f"   Max CV AUC:   {cv_scores_adasyn.max():.4f}")

# Calculate 95% confidence interval
confidence_interval_adasyn = 1.96 * cv_scores_adasyn.std()
print(f"\n 95% Confidence Interval:")
print(f"   {cv_scores_adasyn.mean():.4f} +/- {confidence_interval_adasyn:.4f}")
print(f"   Range: [{cv_scores_adasyn.mean() - confidence_interval_adasyn:.4f}, "
      f"{cv_scores_adasyn.mean() + confidence_interval_adasyn:.4f}]")

# ----------------------------------------------------------------------------
# Train Final ADASYN Model WITH HISTORY TRACKING
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Training Final ADASYN Model")
print("-"*70)

print("\n Training final neural network with ADASYN on all training data...")
print("   This model will be used for final evaluation")
print("   Training for up to 50 epochs with early stopping")
print("   Tracking training history for visualization")
print("   This will take 3-5 minutes")

# Apply ADASYN to entire training set
adasyn_full = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn_full.fit_resample(X_train_scaled, y_train)

# Convert to PyTorch tensors
X_train_adasyn_tensor = torch.FloatTensor(X_train_adasyn.values if isinstance(X_train_adasyn, pd.DataFrame) else X_train_adasyn)
y_train_adasyn_tensor = torch.FloatTensor(y_train_adasyn.values if isinstance(y_train_adasyn, pd.Series) else y_train_adasyn).reshape(-1, 1)

# Create dataset and split for validation
train_dataset_full = TensorDataset(X_train_adasyn_tensor, y_train_adasyn_tensor)

# Split into train and validation (80/20)
train_size = int(0.8 * len(train_dataset_full))
val_size = len(train_dataset_full) - train_size
train_dataset_split, val_dataset_split = torch.utils.data.random_split(
    train_dataset_full, [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

train_loader_full = DataLoader(train_dataset_split, batch_size=32, shuffle=True)
val_loader_full = DataLoader(val_dataset_split, batch_size=32, shuffle=False)

start_time = time.time()

# Create and train final model WITH HISTORY TRACKING
nn_adasyn_model = FraudDetectionNN(X_train_scaled.shape[1], hidden_units=128, dropout=0.3).to(device)

nn_adasyn_model, final_auc, stopped_epoch, adasyn_training_history = train_neural_network(
    nn_adasyn_model, train_loader_full, val_loader_full,
    epochs=50, patience=10, learning_rate=0.001, device=device, track_history=True
)

train_time_adasyn = time.time() - start_time

print(f"\n+ ADASYN model training complete in {train_time_adasyn:.2f} seconds ({train_time_adasyn/60:.1f} minutes)")
print(f"   Stopped at epoch: {stopped_epoch}")
print(f"   Training history captured: {len(adasyn_training_history['epoch'])} epochs")

# Store ADASYN results
nn_adasyn_cv_mean = cv_scores_adasyn.mean()
nn_adasyn_cv_std = cv_scores_adasyn.std()

print(f"\n ADASYN Model Summary:")
print(f"   Expected AUC: {nn_adasyn_cv_mean:.4f} +/- {nn_adasyn_cv_std:.4f}")
print(f"   Stability: {'Excellent' if nn_adasyn_cv_std < 0.01 else 'Good' if nn_adasyn_cv_std < 0.02 else 'Moderate'}")


# ============================================================================
# COMPARISON: SMOTE vs ADASYN
# ============================================================================
print("\n" + "="*70)
print("SMOTE vs ADASYN COMPARISON")
print("="*70)

print(f"\n Performance Comparison:")
print(f"   SMOTE  Mean AUC: {nn_smote_cv_mean:.4f} (+/-{nn_smote_cv_std:.4f})")
print(f"   ADASYN Mean AUC: {nn_adasyn_cv_mean:.4f} (+/-{nn_adasyn_cv_std:.4f})")

# Calculate difference
auc_difference = nn_adasyn_cv_mean - nn_smote_cv_mean
print(f"\n   Difference: {abs(auc_difference):.4f} ({'ADASYN better' if auc_difference > 0 else 'SMOTE better'})")

# Determine winner
if abs(auc_difference) < 0.001:
    winner = "TIE - Performance essentially identical"
    recommendation = "Either technique is suitable - choose based on training time"
elif auc_difference > 0:
    winner = "ADASYN WINS"
    recommendation = "ADASYN provides better fraud detection for this dataset"
else:
    winner = "SMOTE WINS"
    recommendation = "SMOTE provides better fraud detection for this dataset"

print(f"\n Winner: {winner}")
print(f"\n Recommendation: {recommendation}")

print(f"\n Training Time Comparison:")
print(f"   SMOTE:  {cv_time_smote:.2f} seconds ({cv_time_smote/60:.1f} minutes)")
print(f"   ADASYN: {cv_time_adasyn:.2f} seconds ({cv_time_adasyn/60:.1f} minutes)")
print(f"   Difference: {abs(cv_time_smote - cv_time_adasyn):.2f} seconds ({'ADASYN slower' if cv_time_adasyn > cv_time_smote else 'SMOTE slower'})")

print(f"\n Stability Comparison:")
print(f"   SMOTE  Std: {nn_smote_cv_std:.4f} ({'More stable' if nn_smote_cv_std < nn_adasyn_cv_std else 'Less stable'})")
print(f"   ADASYN Std: {nn_adasyn_cv_std:.4f} ({'More stable' if nn_adasyn_cv_std < nn_smote_cv_std else 'Less stable'})")

# Statistical significance test
print(f"\n Statistical Significance:")
smote_lower = nn_smote_cv_mean - confidence_interval_smote
smote_upper = nn_smote_cv_mean + confidence_interval_smote
adasyn_lower = nn_adasyn_cv_mean - confidence_interval_adasyn
adasyn_upper = nn_adasyn_cv_mean + confidence_interval_adasyn

if (smote_lower <= adasyn_upper) and (adasyn_lower <= smote_upper):
    print(f"   Confidence intervals overlap")
    print(f"   Difference may not be statistically significant")
    print(f"   Both methods perform similarly on this dataset")
else:
    print(f"   + Confidence intervals don't overlap")
    print(f"   Difference is likely statistically significant")
    print(f"   {winner} is meaningfully better")

# Visualization of comparison
print("\n" + "-"*70)
print("Visual Comparison")
print("-"*70)

print("\n Mean AUC Scores:")
print(f"   SMOTE:  {'█' * int(nn_smote_cv_mean * 50)} {nn_smote_cv_mean:.4f}")
print(f"   ADASYN: {'█' * int(nn_adasyn_cv_mean * 50)} {nn_adasyn_cv_mean:.4f}")

print(f"\n Consistency (Lower is Better):")
print(f"   SMOTE:  {'█' * int(nn_smote_cv_std * 1000)} {nn_smote_cv_std:.4f}")
print(f"   ADASYN: {'█' * int(nn_adasyn_cv_std * 1000)} {nn_adasyn_cv_std:.4f}")


# ============================================================================
# SELECT BEST MODEL FOR FINAL USE
# ============================================================================
print("\n" + "="*70)
print("SELECTING BEST RESAMPLING TECHNIQUE")
print("="*70)

# Choose best based on mean AUC (with tie-breaker on stability)
if abs(auc_difference) < 0.001:
    # Performance is essentially the same, choose more stable
    if nn_smote_cv_std < nn_adasyn_cv_std:
        best_nn_model = nn_smote_model
        best_method = "SMOTE"
        best_cv_mean = nn_smote_cv_mean
        best_cv_std = nn_smote_cv_std
        best_cv_scores = cv_scores_smote
        best_training_history = smote_training_history
        reason = "Similar performance, but SMOTE is more stable"
    else:
        best_nn_model = nn_adasyn_model
        best_method = "ADASYN"
        best_cv_mean = nn_adasyn_cv_mean
        best_cv_std = nn_adasyn_cv_std
        best_cv_scores = cv_scores_adasyn
        best_training_history = adasyn_training_history
        reason = "Similar performance, but ADASYN is more stable"
else:
    # Clear winner based on AUC
    if nn_smote_cv_mean > nn_adasyn_cv_mean:
        best_nn_model = nn_smote_model
        best_method = "SMOTE"
        best_cv_mean = nn_smote_cv_mean
        best_cv_std = nn_smote_cv_std
        best_cv_scores = cv_scores_smote
        best_training_history = smote_training_history
        reason = f"Higher mean AUC ({nn_smote_cv_mean:.4f} vs {nn_adasyn_cv_mean:.4f})"
    else:
        best_nn_model = nn_adasyn_model
        best_method = "ADASYN"
        best_cv_mean = nn_adasyn_cv_mean
        best_cv_std = nn_adasyn_cv_std
        best_cv_scores = cv_scores_adasyn
        best_training_history = adasyn_training_history
        reason = f"Higher mean AUC ({nn_adasyn_cv_mean:.4f} vs {nn_smote_cv_mean:.4f})"

print(f"\n+ Selected Method: {best_method}")
print(f"   Reason: {reason}")
print(f"   Performance: {best_cv_mean:.4f} +/- {best_cv_std:.4f}")

print(f"\n Storing best model for later steps:")
print(f"   Model: Neural Network + {best_method}")
print(f"   Expected AUC: {best_cv_mean:.4f}")

# Store the best results for comparison in Step 21
nn_final_pipeline = best_nn_model
nn_cv_mean = best_cv_mean
nn_cv_std = best_cv_std
nn_cv_scores = best_cv_scores
nn_best_method = best_method


# ============================================================================
# HYPERPARAMETER TUNING FOR BEST MODEL
# ============================================================================
print("\n" + "="*70)
print("HYPERPARAMETER TUNING - NEURAL NETWORK")
print("="*70)

print(f"\n WHAT WE'RE DOING:")
print(f"   - Fine-tuning the best model ({best_method} + Neural Network)")
print(f"   - Testing different hyperparameter combinations")
print(f"   - Using manual grid search with 5-fold cross-validation")
print(f"   - Goal: Improve beyond baseline {best_cv_mean:.4f} AUC")

# ----------------------------------------------------------------------------
# Define Hyperparameter Grid
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Defining Hyperparameter Search Space")
print("-"*70)

print("\n Hyperparameters to tune:")
print("   1. learning_rate (Learning speed)")
print("      - How fast the model adjusts weights")
print("      - 0.001: Slow, careful learning (more precise)")
print("      - 0.01: Fast learning (quicker but might overshoot)")
print("\n   2. dropout (Regularization strength)")
print("      - Percentage of neurons randomly dropped during training")
print("      - 0.2: Less aggressive (20% dropout)")
print("      - 0.3: More aggressive (30% dropout, current baseline)")
print("\n   3. batch_size (Training batch size)")
print("      - Number of samples processed before updating weights")
print("      - 32: Smaller batches, noisier updates (current)")
print("      - 64: Larger batches, more stable updates")
print("\n   4. hidden_units (Network capacity)")
print("      - Number of neurons in first layer")
print("      - 64: Smaller network (faster, less overfitting)")
print("      - 128: Larger network (more capacity, current baseline)")

# Define parameter grid
param_grid = {
    'learning_rate': [0.001, 0.01],
    'dropout': [0.2, 0.3],
    'batch_size': [32, 64],
    'hidden_units': [64, 128]
}

print("\n Parameter Grid:")
print(f"   learning_rate: {param_grid['learning_rate']}")
print(f"   dropout: {param_grid['dropout']}")
print(f"   batch_size: {param_grid['batch_size']}")
print(f"   hidden_units: {param_grid['hidden_units']}")
print(f"\n   Total combinations: {len(param_grid['learning_rate']) * len(param_grid['dropout']) * len(param_grid['batch_size']) * len(param_grid['hidden_units'])}")
print(f"   With 5-fold CV: {len(param_grid['learning_rate']) * len(param_grid['dropout']) * len(param_grid['batch_size']) * len(param_grid['hidden_units']) * 5} model fits")

# ----------------------------------------------------------------------------
# Run Manual Grid Search
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Running Grid Search with Cross-Validation")
print("-"*70)

print("\n This will take 60-80 minutes...")
print("   Each combination is tested with 5-fold CV")
print("   Neural networks train sequentially (cannot parallelize on Windows)")
print("   Progress will be shown below")

# Choose best resampling method for tuning
if best_method == "SMOTE":
    resampler_class = SMOTE
else:
    resampler_class = ADASYN

# Store all results
all_results = []

start_time = time.time()

# Grid search loop
combination_num = 0
total_combinations = len(param_grid['learning_rate']) * len(param_grid['dropout']) * len(param_grid['batch_size']) * len(param_grid['hidden_units'])

for lr in param_grid['learning_rate']:
    for dropout in param_grid['dropout']:
        for batch_size in param_grid['batch_size']:
            for hidden_units in param_grid['hidden_units']:
                combination_num += 1
                
                print(f"\n  Testing combination {combination_num}/{total_combinations}:")
                print(f"    learning_rate={lr}, dropout={dropout}, batch_size={batch_size}, hidden_units={hidden_units}")
                
                # Run 5-fold CV for this combination
                kfold = KFold(n_splits=5, shuffle=True, random_state=42)
                cv_scores = []
                
                for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_scaled), 1):
                    # Split data
                    X_fold_train = X_train_scaled.iloc[train_idx] if isinstance(X_train_scaled, pd.DataFrame) else X_train_scaled[train_idx]
                    X_fold_val = X_train_scaled.iloc[val_idx] if isinstance(X_train_scaled, pd.DataFrame) else X_train_scaled[val_idx]
                    y_fold_train = y_train.iloc[train_idx] if isinstance(y_train, pd.Series) else y_train[train_idx]
                    y_fold_val = y_train.iloc[val_idx] if isinstance(y_train, pd.Series) else y_train[val_idx]
                    
                    # Apply resampling
                    resampler = resampler_class(random_state=42)
                    X_fold_train_resampled, y_fold_train_resampled = resampler.fit_resample(X_fold_train, y_fold_train)
                    
                    # Convert to tensors
                    X_train_tensor = torch.FloatTensor(X_fold_train_resampled.values if isinstance(X_fold_train_resampled, pd.DataFrame) else X_fold_train_resampled)
                    y_train_tensor = torch.FloatTensor(y_fold_train_resampled.values if isinstance(y_fold_train_resampled, pd.Series) else y_fold_train_resampled).reshape(-1, 1)
                    X_val_tensor = torch.FloatTensor(X_fold_val.values if isinstance(X_fold_val, pd.DataFrame) else X_fold_val)
                    y_val_tensor = torch.FloatTensor(y_fold_val.values if isinstance(y_fold_val, pd.Series) else y_fold_val).reshape(-1, 1)
                    
                    # Create data loaders with current batch_size
                    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
                    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
                    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
                    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
                    
                    # Create and train model
                    model = FraudDetectionNN(X_train_scaled.shape[1], hidden_units=hidden_units, dropout=dropout).to(device)
                    model, fold_auc, stopped_epoch = train_neural_network(
                        model, train_loader, val_loader,
                        epochs=20, patience=5, learning_rate=lr, device=device, track_history=False
                    )
                    
                    cv_scores.append(fold_auc)
                
                # Calculate mean and std for this combination
                mean_auc = np.mean(cv_scores)
                std_auc = np.std(cv_scores)
                
                # Store results
                all_results.append({
                    'learning_rate': lr,
                    'dropout': dropout,
                    'batch_size': batch_size,
                    'hidden_units': hidden_units,
                    'mean_auc': mean_auc,
                    'std_auc': std_auc,
                    'cv_scores': cv_scores
                })
                
                print(f"    Mean AUC: {mean_auc:.4f} (+/-{std_auc:.4f})")

grid_time = time.time() - start_time

# ----------------------------------------------------------------------------
# Display Results
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Grid Search Results")
print("-"*70)

print(f"\n+ Grid search complete in {grid_time:.2f} seconds ({grid_time/60:.1f} minutes)")

# Find best combination
all_results_sorted = sorted(all_results, key=lambda x: x['mean_auc'], reverse=True)
best_params = all_results_sorted[0]

print(f"\n Best Parameters Found:")
print(f"   learning_rate: {best_params['learning_rate']}")
print(f"   dropout: {best_params['dropout']}")
print(f"   batch_size: {best_params['batch_size']}")
print(f"   hidden_units: {best_params['hidden_units']}")

print(f"\n Performance Comparison:")
print(f"   Baseline AUC:  {best_cv_mean:.4f} (+/-{best_cv_std:.4f})")
print(f"   Tuned AUC:     {best_params['mean_auc']:.4f} (+/-{best_params['std_auc']:.4f})")

# Calculate improvement
improvement = best_params['mean_auc'] - best_cv_mean
improvement_pct = (improvement / best_cv_mean) * 100

print(f"\n Improvement:")
if improvement > 0:
    print(f"   +{improvement:.4f} AUC ({improvement_pct:+.2f}%)")
    print(f"   + Hyperparameter tuning improved performance!")
elif improvement < -0.001:
    print(f"   {improvement:.4f} AUC ({improvement_pct:.2f}%)")
    print(f"   Tuned model performed slightly worse")
    print(f"   -> Baseline hyperparameters were already well-optimized")
else:
    print(f"   ~{improvement:.4f} AUC (essentially no change)")
    print(f"   -> Baseline hyperparameters were already optimal")

# ----------------------------------------------------------------------------
# Top 5 Parameter Combinations
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Top 5 Hyperparameter Combinations")
print("-"*70)

print("\n Best performing combinations:\n")
for i, result in enumerate(all_results_sorted[:5], 1):
    print(f"   Rank {i}:")
    print(f"      learning_rate={result['learning_rate']}, dropout={result['dropout']}, "
          f"batch_size={result['batch_size']}, hidden_units={result['hidden_units']}")
    print(f"      Mean AUC: {result['mean_auc']:.4f} (+/-{result['std_auc']:.4f})")
    print()

# ----------------------------------------------------------------------------
# Train Final Tuned Model WITH HISTORY TRACKING
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Training Final Tuned Model")
print("-"*70)

print("\n Training final neural network with best hyperparameters...")
print("   Tracking training history for visualization")
print("   This will take 3-5 minutes")

# Apply resampling to full training set
if best_method == "SMOTE":
    resampler_full = SMOTE(random_state=42)
else:
    resampler_full = ADASYN(random_state=42)

X_train_resampled_full, y_train_resampled_full = resampler_full.fit_resample(X_train_scaled, y_train)

# Convert to tensors
X_train_full_tensor = torch.FloatTensor(X_train_resampled_full.values if isinstance(X_train_resampled_full, pd.DataFrame) else X_train_resampled_full)
y_train_full_tensor = torch.FloatTensor(y_train_resampled_full.values if isinstance(y_train_resampled_full, pd.Series) else y_train_resampled_full).reshape(-1, 1)

# Create dataset and split
train_dataset_full = TensorDataset(X_train_full_tensor, y_train_full_tensor)
train_size = int(0.8 * len(train_dataset_full))
val_size = len(train_dataset_full) - train_size
train_dataset_split, val_dataset_split = torch.utils.data.random_split(
    train_dataset_full, [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

train_loader_tuned = DataLoader(train_dataset_split, batch_size=best_params['batch_size'], shuffle=True)
val_loader_tuned = DataLoader(val_dataset_split, batch_size=best_params['batch_size'], shuffle=False)

start_time = time.time()

# Create and train final tuned model WITH HISTORY TRACKING
nn_tuned_model = FraudDetectionNN(
    X_train_scaled.shape[1],
    hidden_units=best_params['hidden_units'],
    dropout=best_params['dropout']
).to(device)

nn_tuned_model, final_auc, stopped_epoch, tuned_training_history = train_neural_network(
    nn_tuned_model, train_loader_tuned, val_loader_tuned,
    epochs=50, patience=10, learning_rate=best_params['learning_rate'], device=device, track_history=True
)

final_train_time = time.time() - start_time

print(f"\n+ Tuned model training complete in {final_train_time:.2f} seconds ({final_train_time/60:.1f} minutes)")
print(f"   Stopped at epoch: {stopped_epoch}")
print(f"   Training history captured: {len(tuned_training_history['epoch'])} epochs")

# Store tuned results
nn_tuned_cv_mean = best_params['mean_auc']
nn_tuned_cv_std = best_params['std_auc']
nn_best_params = {
    'learning_rate': best_params['learning_rate'],
    'dropout': best_params['dropout'],
    'batch_size': best_params['batch_size'],
    'hidden_units': best_params['hidden_units']
}

print(f"\n+ Tuned model stored for final evaluation")
print(f"   Best hyperparameters saved")
print(f"   Expected AUC: {nn_tuned_cv_mean:.4f}")

# ----------------------------------------------------------------------------
# Decision: Use Baseline or Tuned?
# ----------------------------------------------------------------------------
print("\n" + "-"*70)
print("Model Selection Decision")
print("-"*70)

# Decide whether to use tuned or baseline model
if improvement > 0.0001:  # Meaningful improvement
    print(f"\n+ Using TUNED model for final evaluation")
    print(f"   Reason: Tuning improved performance by {improvement:.4f}")
    nn_final_model = nn_tuned_model
    nn_final_cv_mean = nn_tuned_cv_mean
    nn_final_cv_std = nn_tuned_cv_std
    nn_final_training_history = tuned_training_history
    model_version = "Tuned"
else:
    print(f"\n+ Using BASELINE model for final evaluation")
    print(f"   Reason: Tuning did not provide meaningful improvement")
    print(f"   Baseline hyperparameters were already well-optimized")
    nn_final_model = nn_final_pipeline
    nn_final_cv_mean = best_cv_mean
    nn_final_cv_std = best_cv_std
    nn_final_training_history = best_training_history
    model_version = "Baseline"

print(f"\n Final Neural Network Model:")
print(f"   Version: {model_version}")
print(f"   Resampling: {best_method}")
print(f"   Expected AUC: {nn_final_cv_mean:.4f} (+/-{nn_final_cv_std:.4f})")


# ============================================================================
# SAVE MODEL AND RESULTS TO DISK
# ============================================================================
print("\n" + "="*70)
print("SAVING MODEL AND RESULTS")
print("="*70)

print("\n Saving to ../models/ directory...")

# Save the final model (PyTorch uses .pth extension)
model_path = '../models/neural_network_model.pth'
torch.save(nn_final_model.state_dict(), model_path)
print(f"   + Model saved: {model_path}")

# Save all results and metadata INCLUDING TRAINING HISTORY
results_data = {
    'model_name': 'Neural Network',
    'resampling_method': best_method,
    'model_version': model_version,
    'cv_mean': nn_final_cv_mean,
    'cv_std': nn_final_cv_std,
    'cv_scores': nn_cv_scores,
    'best_params': nn_best_params if model_version == "Tuned" else None,
    'training_samples': total_samples,
    'training_frauds': total_frauds,
    'architecture': {
        'input_dim': X_train_scaled.shape[1],
        'hidden_units': best_params['hidden_units'] if model_version == "Tuned" else 128,
        'dropout': best_params['dropout'] if model_version == "Tuned" else 0.3
    },
    'training_history': nn_final_training_history,  # ADDED: Training history
    'timestamp': pd.Timestamp.now()
}

results_path = '../models/neural_network_results.pkl'
with open(results_path, 'wb') as f:
    pickle.dump(results_data, f)
print(f"   + Results saved: {results_path}")

print(f"\n+ All files saved successfully!")
print(f"   Model can now be loaded in Phase 5 for comparison")
print(f"   Training history available for Step 22 visualization")


# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("\n" + "="*70)
print("STEP 20 COMPLETE - KEY ACCOMPLISHMENTS")
print("="*70)

print("\n What We Accomplished:")
print("   + Built 4-layer Neural Network (deep learning)")
print("   + Compared SMOTE vs ADASYN performance")
print("   + Used manual 5-Fold CV (PyTorch requirement)")
print("   + Resampling applied inside each fold (proper methodology)")
print("   + Validated on 100% real data (no synthetic contamination)")
print("   + Leveraged GPU acceleration for faster training")
print("   + Selected best resampling technique based on performance")
print("   + Performed hyperparameter tuning with grid search")
print("   + Trained final optimized model on all training data")
print("   + SAVED model and results to disk")
print("   + CAPTURED training history for visualization")

print(f"\n Final Results:")
print(f"   Best Method: {best_method}")
print(f"   Model Version: {model_version}")
print(f"   Cross-Validation AUC: {nn_final_cv_mean:.4f} (+/-{nn_final_cv_std:.4f})")
print(f"   Model Stability: {'Excellent' if nn_final_cv_std < 0.01 else 'Good' if nn_final_cv_std < 0.02 else 'Moderate'}")
if torch.cuda.is_available():
    print(f"   GPU Acceleration: + Used successfully")
else:
    print(f"   GPU Acceleration: - Not available (CPU training)")

print("\n Saved Files:")
print(f"   - {model_path}")
print(f"   - {results_path}")

print("\n Next Steps:")
print("   -> Step 21: Compare all models and select winner")
print("   -> Step 22: Visualize neural network training curves")
print("   -> Final evaluation on test set")

print("\n Important Notes:")
print("   - Test set remains UNTOUCHED for final evaluation in Phase 5")
print("   - Model saved and can be loaded anytime (no need to retrain)")
print("   - Can restart PC without losing progress")
print(f"   - {best_method} + {model_version} model ready for Phase 5")
print(f"   - All 4 models trained with consistent methodology for fair comparison")
print(f"   - Training history saved for Step 22 visualization")

print("\n" + "="*70)


MODEL 4: NEURAL NETWORK (DEEP LEARNING)

 WHAT WE'RE DOING:
   - Building Neural Network classifier (deep learning)
   - Testing TWO resampling techniques: SMOTE vs ADASYN
   - Using manual 5-fold cross-validation (PyTorch requirement)
   - Applying resampling INSIDE each CV fold (proper methodology)
   - Validating on REAL data only (no synthetic frauds in validation)
   - Leveraging GPU acceleration for faster training
   - SAVING model and results to disk for later use
   - TRACKING training history for visualization

ABOUT NEURAL NETWORKS

 What is a Neural Network?
   - Deep learning model inspired by human brain
   - Multiple layers of interconnected 'neurons'
   - Each layer learns increasingly complex patterns
   - Can discover non-linear relationships in data
   - State-of-the-art for many complex pattern recognition tasks

 How Neural Networks Differ from Other Models:
   Logistic Regression:
      - Linear decision boundary (straight line)
      - Fast and interpretable
   