In [3]:
# %% [markdown]
# # Task 2: Model Building and Training for Fraud Detection
# 
# **Objective**: Build, train, and evaluate classification models to detect fraudulent transactions, 
# using appropriate techniques for imbalanced data.

# %%
# Standard library
import os
import time
import warnings

# Silence library warnings for the whole notebook (set before the ML imports below)
warnings.filterwarnings('ignore')

# Data handling, plotting, persistence
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Machine learning imports
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report, 
                           roc_auc_score, average_precision_score, 
                           precision_recall_curve, roc_curve, f1_score,
                           precision_score, recall_score, accuracy_score)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Plot appearance: apply the 'seaborn-v0_8-darkgrid' matplotlib style and
# seaborn's "husl" colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# %%
# ============================================
# PART 1: E-COMMERCE FRAUD MODELING
# ============================================

# Section banner: a 60-character rule above and below the title
section_rule = "=" * 60
print(section_rule, "PART 1: E-COMMERCE FRAUD DETECTION MODELING", section_rule, sep="\n")

# %%
# Read the feature-engineered e-commerce table and print a quick profile
print("\n1. Loading feature-engineered e-commerce data...")
ecom_data = pd.read_csv('../data/processed/fraud_data_featured.csv')
print(f"Dataset shape: {ecom_data.shape}")
print(f"Columns: {len(ecom_data.columns)}")
print(f"Fraud rate: {ecom_data['class'].mean()*100:.2f}%")

# Missing-value report: overall total first, then a per-column breakdown
# (count and percentage of rows) for every column that has at least one gap
null_counts = ecom_data.isnull().sum()
print(f"\nMissing values in dataset: {null_counts.sum()}")
missing_info = null_counts[null_counts > 0]
if not missing_info.empty:
    print("\nColumns with missing values:")
    for col_name, n_missing in missing_info.items():
        share_pct = n_missing / len(ecom_data) * 100
        print(f"  {col_name}: {n_missing} missing ({share_pct:.2f}%)")

# %%
# Data preparation for e-commerce - SIMPLIFIED APPROACH
print("\n2. Preparing e-commerce data for modeling...")

# Separate features and target ('class' is the fraud label)
X_ecom = ecom_data.drop('class', axis=1)
y_ecom = ecom_data['class']

print(f"Feature matrix shape: {X_ecom.shape}")
print(f"Target shape: {y_ecom.shape}")

# SIMPLE MISSING VALUE HANDLING
print("\n3. Handling missing values with simple approach...")

# Strategy: drop columns whose missing fraction reaches the threshold,
# then median-fill whatever gaps remain in the numeric columns.
missing_threshold = 0.5  # 50% threshold

# Fraction of missing rows per column, computed in one pass
missing_fraction = X_ecom.isnull().mean()
keep_mask = missing_fraction < missing_threshold
for col, missing_pct in missing_fraction[~keep_mask].items():
    print(f"  Dropping column '{col}' - {missing_pct*100:.1f}% missing")
columns_to_keep = missing_fraction.index[keep_mask].tolist()

# .copy() makes X_ecom an independent frame, so the column assignments below
# (and in later cells) do not write into a slice of the previous frame.
X_ecom = X_ecom[columns_to_keep].copy()

# Fill remaining missing values with the column median.
# Only numeric columns are filled here: median() raises on non-numeric object
# columns, and those are coerced to numeric (and then filled) in the
# conversion step that follows.
# NOTE(review): these medians are computed on the full dataset, before the
# train/test split, so test rows influence the imputed values. Consider
# imputing after the split using training statistics only.
for col in X_ecom.columns[X_ecom.isnull().any()]:
    if pd.api.types.is_numeric_dtype(X_ecom[col]):
        X_ecom[col] = X_ecom[col].fillna(X_ecom[col].median())
    else:
        print(f"  Leaving non-numeric column '{col}' for the numeric conversion step")

print(f"\nAfter cleaning: {X_ecom.shape}")
print(f"Remaining missing values: {X_ecom.isnull().sum().sum()}")

# %%
# Cast boolean feature columns to 0/1 integers
print("\n4. Converting boolean columns to integer...")
bool_cols = X_ecom.select_dtypes(include=['bool']).columns.tolist()
if len(bool_cols) > 0:
    print(f"Converting {len(bool_cols)} boolean columns to integer")
    X_ecom[bool_cols] = X_ecom[bool_cols].astype(int)

# Make every remaining object-dtype column numeric: values that cannot be
# parsed become NaN (errors='coerce') and are then replaced by the median of
# the successfully parsed values in that column.
object_cols = [name for name in X_ecom.columns if X_ecom[name].dtype == 'object']
for col in object_cols:
    print(f"  Converting object column '{col}' to numeric...")
    coerced = pd.to_numeric(X_ecom[col], errors='coerce')
    X_ecom[col] = coerced.fillna(coerced.median())

print(f"Final feature matrix shape: {X_ecom.shape}")

# %%
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# fraud rate the same in both partitions, and the fixed seed makes the split
# repeatable.
print("\n5. Creating train-test split (80-20 stratified)...")
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y_ecom)
X_train_ecom, X_test_ecom, y_train_ecom, y_test_ecom = train_test_split(
    X_ecom, y_ecom, **split_kwargs
)

# Report partition sizes first, then the fraud rate inside each partition
for part_label, part_features in (("Training", X_train_ecom), ("Test", X_test_ecom)):
    print(f"{part_label} set: {part_features.shape[0]:,} samples")
for part_label, part_target in (("Training", y_train_ecom), ("Test", y_test_ecom)):
    print(f"{part_label} fraud rate: {part_target.mean()*100:.2f}%")

# %%
# Handle class imbalance using SMOTE (for e-commerce data)
print("\n6. Applying SMOTE to handle class imbalance...")

# Final check for NaN values (counted once and reused below)
train_nan_count = int(np.isnan(X_train_ecom.values).sum())
test_nan_count = int(X_test_ecom.isnull().sum().sum())
print(f"NaN values in training features: {train_nan_count}")
print(f"NaN values in training target: {y_train_ecom.isnull().sum()}")

# Ensure no NaN values remain in either partition. The fill values are the
# medians of the training partition only, and the same values are applied to
# the test partition so no test statistics are used and the test matrix
# cannot reach the models with gaps in it.
if train_nan_count > 0 or test_nan_count > 0:
    print("Filling remaining NaN values...")
    train_medians = X_train_ecom.median()
    X_train_ecom = X_train_ecom.fillna(train_medians)
    X_test_ecom = X_test_ecom.fillna(train_medians)

# Apply SMOTE only to training data; the test partition keeps its real
# class distribution
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train_ecom, y_train_ecom)

print(f"Before SMOTE - Training shape: {X_train_ecom.shape}")
print(f"After SMOTE - Training shape: {X_train_smote.shape}")
print(f"Before SMOTE - Fraud rate: {y_train_ecom.mean()*100:.2f}%")
print(f"After SMOTE - Fraud rate: {y_train_smote.mean()*100:.2f}%")

# Save the SMOTE object for later use. The output directory is created first
# so the dump does not fail when '../models' does not exist yet.
os.makedirs('../models', exist_ok=True)
joblib.dump(smote, '../models/ecommerce_smote.pkl')
print("SMOTE object saved to '../models/ecommerce_smote.pkl'")

# %%
# Standardize features (needed by Logistic Regression). The scaler learns its
# statistics from the resampled training matrix and is then applied unchanged
# to both the training and the test matrices.
print("\n7. Scaling features...")
scaler_ecom = StandardScaler().fit(X_train_smote)
X_train_scaled = scaler_ecom.transform(X_train_smote)
X_test_scaled = scaler_ecom.transform(X_test_ecom)

# Persist the fitted scaler alongside the models
scaler_path = '../models/ecommerce_scaler.pkl'
joblib.dump(scaler_ecom, scaler_path)
print(f"Scaler saved to '{scaler_path}'")

# %%
# Define evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name="Model",
                   fn_cost=100, fp_cost=10):
    """
    Fit a binary classifier and score it on a held-out set.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``
    X_train, y_train : training features and labels passed to ``model.fit``
    X_test, y_test : held-out features and labels used for every metric
    model_name : str
        Label stored under the 'Model' key of the metrics dict.
    fn_cost, fp_cost : number
        Per-case cost of a false negative / false positive in the simplified
        'Business_Cost_Score' (defaults 100 and 10).

    Returns
    -------
    tuple
        ``(metrics, model, y_pred, y_pred_proba, cm)`` where ``metrics`` is a
        dict of scores and confusion-matrix counts, ``model`` is the fitted
        estimator, ``y_pred`` the predicted labels, ``y_pred_proba`` the
        second column of ``predict_proba`` and ``cm`` the 2x2 confusion matrix.
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments; only the fit call is timed
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # ROC AUC is undefined when y_test holds a single class; report NaN
    # instead of letting the scorer fail
    if np.unique(y_test).size >= 2:
        roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        roc_auc = float('nan')

    # Calculate metrics (zero_division=0 everywhere: an undefined ratio is
    # reported as 0 without a warning)
    metrics = {
        'Model': model_name,
        'Training_Time': round(training_time, 2),
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1_Score': f1_score(y_test, y_pred, zero_division=0),
        'ROC_AUC': roc_auc,
        'PR_AUC': average_precision_score(y_test, y_pred_proba)
    }

    # Additional metrics for business context
    cm = confusion_matrix(y_test, y_pred)
    if cm.shape != (2, 2):
        # Fewer than two labels were observed, so the matrix is not 2x2 and
        # cannot be unpacked into four counts. Pin the label order to the
        # 0/1 coding so the matrix always has the tn/fp/fn/tp layout.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    metrics.update({
        'True_Negative': tn,
        'False_Positive': fp,
        'False_Negative': fn,
        'True_Positive': tp,
        'False_Positive_Rate': fp / (fp + tn) if (fp + tn) > 0 else 0,
        'False_Negative_Rate': fn / (fn + tp) if (fn + tp) > 0 else 0,
        # Simplified cost model: average misclassification cost per test row
        'Business_Cost_Score': (fn * fn_cost + fp * fp_cost) / len(y_test)
    })

    return metrics, model, y_pred, y_pred_proba, cm

# %%
# Baseline Model 1: Logistic Regression
print("\n" + "="*60, "MODEL 1: LOGISTIC REGRESSION (Baseline)", "="*60, sep="\n")

# Hyper-parameters for the baseline, collected in one place
lr_params = dict(
    random_state=42,
    max_iter=1000,
    class_weight='balanced',
    solver='liblinear',
    C=1.0,
)
lr_model = LogisticRegression(**lr_params)

# Trained on the resampled + standardized matrix, scored on the standardized
# test matrix
lr_metrics, lr_model_fitted, y_pred_lr, y_pred_proba_lr, cm_lr = evaluate_model(
    lr_model,
    X_train_scaled, X_test_scaled,
    y_train_smote, y_test_ecom,
    model_name="Logistic Regression",
)

# Print every metric except the model label; floats get four decimals
print("\nLogistic Regression Performance:")
for key, value in lr_metrics.items():
    if key == 'Model':
        continue
    rendered = f"{value:.4f}" if isinstance(value, float) else f"{value}"
    print(f"  {key}: {rendered}")

# %%
# Model 2: Random Forest
rf_banner_rule = "=" * 60
print("\n" + rf_banner_rule)
print("MODEL 2: RANDOM FOREST")
print(rf_banner_rule)

rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=10,
    min_samples_split=10, min_samples_leaf=5,
    class_weight='balanced',
    random_state=42, n_jobs=-1,
)

# Tree ensembles are fit on the resampled (unscaled) features and scored on
# the untouched test partition
rf_results = evaluate_model(
    rf_model, X_train_smote, X_test_ecom, y_train_smote, y_test_ecom,
    "Random Forest"
)
rf_metrics, rf_model_fitted, y_pred_rf, y_pred_proba_rf, cm_rf = rf_results

# One line per metric (model label skipped); floats shown with four decimals
print("\nRandom Forest Performance:")
rf_report_lines = [
    f"  {name}: {score:.4f}" if isinstance(score, float) else f"  {name}: {score}"
    for name, score in rf_metrics.items()
    if name != 'Model'
]
for line in rf_report_lines:
    print(line)

# %%
# Model 3: XGBoost
print("\n" + "="*60)
print("MODEL 3: XGBOOST")
print("="*60)

# Calculate scale_pos_weight from the labels the model is actually fit on.
# The model trains on the SMOTE-resampled labels, so the weight must reflect
# that class ratio; deriving it from the pre-SMOTE labels would up-weight the
# positive class a second time on top of the oversampling.
n_negative = int((y_train_smote == 0).sum())
n_positive = int((y_train_smote == 1).sum())
# Guard against a training set without positives (fall back to no re-weighting)
scale_pos_weight = n_negative / n_positive if n_positive > 0 else 1.0

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
    n_jobs=-1
)

# Fit on the resampled training data, score on the untouched test partition
xgb_metrics, xgb_model_fitted, y_pred_xgb, y_pred_proba_xgb, cm_xgb = evaluate_model(
    xgb_model, X_train_smote, X_test_ecom, y_train_smote, y_test_ecom, 
    "XGBoost"
)

# Print every metric except the model label; floats get four decimals
print("\nXGBoost Performance:")
for key, value in xgb_metrics.items():
    if key != 'Model':
        if isinstance(value, float):
            print(f"  {key}: {value:.4f}")
        else:
            print(f"  {key}: {value}")

# %%
# Model 4: LightGBM
print("\n" + "="*60, "MODEL 4: LIGHTGBM", "="*60, sep="\n")

# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` is > 0, and that is not set here - confirm whether row
# subsampling was intended.
lgbm_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'class_weight': 'balanced',
    'random_state': 42,
    'verbose': -1,
    'n_jobs': -1,
}
lgbm_model = LGBMClassifier(**lgbm_params)

# Fit on the resampled training data, score on the untouched test partition
lgbm_results = evaluate_model(
    lgbm_model, X_train_smote, X_test_ecom, y_train_smote, y_test_ecom,
    "LightGBM"
)
lgbm_metrics, lgbm_model_fitted, y_pred_lgbm, y_pred_proba_lgbm, cm_lgbm = lgbm_results

# Metric listing: skip the label entry, show floats with four decimals
print("\nLightGBM Performance:")
for metric_name in lgbm_metrics:
    if metric_name == 'Model':
        continue
    metric_value = lgbm_metrics[metric_name]
    if isinstance(metric_value, float):
        print(f"  {metric_name}: {metric_value:.4f}")
    else:
        print(f"  {metric_name}: {metric_value}")

# %%
# Side-by-side comparison of the four e-commerce models
comparison_rule = "=" * 60
print("\n" + comparison_rule)
print("E-COMMERCE MODEL COMPARISON")
print(comparison_rule)

# One row per model, one column per metric
ecom_metrics_list = [lr_metrics, rf_metrics, xgb_metrics, lgbm_metrics]
ecom_metrics_df = pd.DataFrame(ecom_metrics_list)

# Show floats with four decimals (this is a global pandas display option)
pd.set_option('display.float_format', '{:.4f}'.format)

# Only the headline columns are printed; the confusion-matrix counts and
# training time stay in the frame and are written to the CSV below
comparison_cols = [
    'Model',
    'Accuracy', 'Precision', 'Recall', 'F1_Score',
    'ROC_AUC', 'PR_AUC',
    'False_Positive_Rate', 'False_Negative_Rate',
    'Business_Cost_Score',
]
print("\nPerformance Comparison Table:")
comparison_table = ecom_metrics_df[comparison_cols]
print(comparison_table.to_string(index=False))

# %%
# Persist the full metrics table (all columns) for later reporting
comparison_csv_path = '../data/processed/ecommerce_model_comparison.csv'
ecom_metrics_df.to_csv(comparison_csv_path, index=False)
print(f"\nModel comparison saved to '{comparison_csv_path}'")

# %%
# Cross-validation for the best model
print("\n" + "="*60)
print("CROSS-VALIDATION FOR BEST E-COMMERCE MODEL")
print("="*60)

# Select best model based on PR-AUC
# NOTE(review): this ranking uses the PR-AUC measured on the test partition,
# so the test set is also acting as a model-selection set.
best_ecom_model_name = ecom_metrics_df.loc[ecom_metrics_df['PR_AUC'].idxmax(), 'Model']
print(f"Best model based on PR-AUC: {best_ecom_model_name}")

# Get the best model (any unrecognised name falls back to LightGBM)
fitted_models_by_name = {
    "Logistic Regression": lr_model_fitted,
    "Random Forest": rf_model_fitted,
    "XGBoost": xgb_model_fitted,
}
best_ecom_model = fitted_models_by_name.get(best_ecom_model_name, lgbm_model_fitted)

# Build the cross-validation estimator. Resampling must happen inside each
# fold: running CV on the already-SMOTE'd matrix puts synthetic points that
# were interpolated from a fold's training rows into its validation fold,
# which inflates the scores. An imblearn pipeline applies SMOTE while fitting
# only, so every validation fold consists of real rows at the real class ratio.
cv_steps = [('smote', SMOTE(random_state=42, k_neighbors=5))]
if best_ecom_model_name == "Logistic Regression":
    # Logistic Regression was trained on standardized features, so the scaler
    # is refit inside each fold, after resampling (same order as the main run)
    cv_steps.append(('scaler', StandardScaler()))
cv_steps.append(('model', best_ecom_model))
cv_pipeline = ImbPipeline(cv_steps)

# Perform Stratified K-Fold Cross Validation
print(f"\nPerforming 5-fold cross-validation for {best_ecom_model_name}...")

# Use the original (un-resampled, unscaled) training partition for CV;
# cross_val_score clones the pipeline, so the fitted best model is not refit
X_cv = X_train_ecom
y_cv = y_train_ecom
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    cv_pipeline, X_cv, y_cv,
    cv=skf,
    scoring='average_precision',
    n_jobs=-1
)

print(f"Cross-validation PR-AUC scores: {cv_scores}")
print(f"Mean PR-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

# %%
# Persist the winning model; spaces in its name become underscores in the file name
print(f"\nSaving best e-commerce model: {best_ecom_model_name}")
best_model_slug = "_".join(best_ecom_model_name.split(" "))
model_filename = '../models/best_ecommerce_model_' + best_model_slug + '.pkl'
joblib.dump(best_ecom_model, model_filename)
print(f"Model saved to '{model_filename}'")

# Store the feature column order next to the model
feature_names_path = '../models/ecommerce_feature_names.pkl'
joblib.dump(X_ecom.columns.tolist(), feature_names_path)
print(f"Feature names saved to '{feature_names_path}'")

# %%
print("\n" + "="*60, "E-COMMERCE MODELING COMPLETE!", "="*60, sep="\n")
print(f"\nBest model: {best_ecom_model_name}")

# Headline test-set metrics of the winning model, read from the comparison table
best_row_mask = ecom_metrics_df['Model'] == best_ecom_model_name
summary_fields = (
    ("PR-AUC", 'PR_AUC'),
    ("Recall", 'Recall'),
    ("False Positive Rate", 'False_Positive_Rate'),
)
for display_label, metric_column in summary_fields:
    print(f"{display_label}: {ecom_metrics_df.loc[best_row_mask, metric_column].values[0]:.4f}")

PART 1: E-COMMERCE FRAUD DETECTION MODELING

1. Loading feature-engineered e-commerce data...
Dataset shape: (151112, 50)
Columns: 50
Fraud rate: 9.36%

Missing values in dataset: 151112

Columns with missing values:
  user_std_amount: 151112 missing (100.00%)

2. Preparing e-commerce data for modeling...
Feature matrix shape: (151112, 49)
Target shape: (151112,)

3. Handling missing values with simple approach...
  Dropping column 'user_std_amount' - 100.0% missing

After cleaning: (151112, 48)
Remaining missing values: 0

4. Converting boolean columns to integer...
Converting 4 boolean columns to integer
Final feature matrix shape: (151112, 48)

5. Creating train-test split (80-20 stratified)...
Training set: 120,889 samples
Test set: 30,223 samples
Training fraud rate: 9.36%
Test fraud rate: 9.36%

6. Applying SMOTE to handle class imbalance...
NaN values in training features: 0
NaN values in training target: 0
Before SMOTE - Training shape: (120889, 48)
After SMOTE - Training shape