# Fraud Detection - Model Training and Imbalanced Data Handling

This notebook focuses on training various machine learning models for fraud detection while addressing the class imbalance challenge.

## Objectives:
1. Load preprocessed data
2. Implement techniques for handling class imbalance
3. Train multiple machine learning models
4. Optimize hyperparameters
5. Compare model performances
6. Select best models for each dataset

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
from collections import Counter

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb

# Imbalanced Learning
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier

# Model Selection and Evaluation
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.metrics import average_precision_score, f1_score, precision_score, recall_score

# Utilities
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
import joblib

warnings.filterwarnings('ignore')
np.random.seed(42)

ModuleNotFoundError: No module named 'xgboost'

## 1. Load Preprocessed Data

In [None]:
# Load the preprocessed train/test splits written by the earlier notebook stage
print("Loading preprocessed data...")


def _read_features(path):
    """Read a feature matrix CSV into a DataFrame."""
    return pd.read_csv(path)


def _read_target(path):
    """Read a label CSV and squeeze() it (presumably a single column -> Series)."""
    return pd.read_csv(path).squeeze()


# E-commerce fraud dataset
X_fraud_train = _read_features('../results/X_fraud_train_scaled.csv')
X_fraud_test = _read_features('../results/X_fraud_test_scaled.csv')
y_fraud_train = _read_target('../results/y_fraud_train.csv')
y_fraud_test = _read_target('../results/y_fraud_test.csv')

# Credit card dataset
X_cc_train = _read_features('../results/X_cc_train_scaled.csv')
X_cc_test = _read_features('../results/X_cc_test_scaled.csv')
y_cc_train = _read_target('../results/y_cc_train.csv')
y_cc_test = _read_target('../results/y_cc_test.csv')

# Feature metadata pickled alongside the splits.
# NOTE(review): pickle.load executes arbitrary code; only load files this
# project produced itself.
with open('../results/feature_info.pkl', 'rb') as f:
    feature_info = pickle.load(f)

# Shape summary for both datasets
for dataset_label, train_frame, test_frame in (
    ("Fraud data", X_fraud_train, X_fraud_test),
    ("Credit card data", X_cc_train, X_cc_test),
):
    print(f"{dataset_label} - Train: {train_frame.shape}, Test: {test_frame.shape}")

# Class balance of the training labels
fraud_class_counts = Counter(y_fraud_train)
cc_class_counts = Counter(y_cc_train)
print(f"\nFraud data class distribution: {fraud_class_counts}")
print(f"Credit card data class distribution: {cc_class_counts}")

## 2. Imbalanced Data Handling Techniques

In [None]:
# 2.1 Define sampling strategies

def apply_sampling_strategy(X, y, strategy='smote', random_state=42):
    """Rebalance ``(X, y)`` with the resampler selected by ``strategy``.

    Args:
        X: Feature matrix passed to the sampler's ``fit_resample``.
        y: Class labels aligned with ``X``.
        strategy: One of 'none', 'smote', 'adasyn', 'borderline_smote',
            'smote_tomek', 'smote_enn' or 'random_undersample'.
        random_state: Seed forwarded to the chosen sampler.

    Returns:
        ``(X, y)`` untouched for 'none'; otherwise the resampled pair.

    Raises:
        ValueError: If ``strategy`` is not one of the names listed above.
    """
    # 'none' skips resampling and the distribution printout altogether.
    if strategy == 'none':
        return X, y

    # Zero-argument factories, so only the requested sampler is ever built.
    sampler_factories = {
        'smote': lambda: SMOTE(random_state=random_state),
        'adasyn': lambda: ADASYN(random_state=random_state),
        'borderline_smote': lambda: BorderlineSMOTE(random_state=random_state),
        'smote_tomek': lambda: SMOTETomek(random_state=random_state),
        'smote_enn': lambda: SMOTEENN(random_state=random_state),
        'random_undersample': lambda: RandomUnderSampler(random_state=random_state),
    }

    build_sampler = sampler_factories.get(strategy)
    if build_sampler is None:
        raise ValueError(f"Unknown sampling strategy: {strategy}")

    X_resampled, y_resampled = build_sampler().fit_resample(X, y)

    # Report the class counts before and after resampling.
    print(f"Original distribution: {Counter(y)}")
    print(f"Resampled distribution: {Counter(y_resampled)}")

    return X_resampled, y_resampled

# Strategy names accepted by apply_sampling_strategy(). Every branch of that
# function is listed so the "available" printout matches what it supports
# ('smote_enn' and 'random_undersample' were previously missing).
sampling_strategies = [
    'none',
    'smote',
    'adasyn',
    'borderline_smote',
    'smote_tomek',
    'smote_enn',
    'random_undersample',
]

print("Available sampling strategies:", sampling_strategies)

## 3. Model Definitions

In [None]:
# 3.1 Define models with class weight handling

def get_models(class_weight='balanced'):
    """Build a fresh set of unfitted candidate classifiers keyed by display name.

    Args:
        class_weight: Forwarded to the estimators that are given a
            ``class_weight`` argument here: Logistic Regression, Random
            Forest and LightGBM. The Balanced Random Forest, XGBoost and
            Gradient Boosting entries are constructed without it.

    Returns:
        dict mapping display name -> estimator, each seeded with
        ``random_state=42``.
    """
    seed = 42
    models = {}

    models['Logistic Regression'] = LogisticRegression(
        class_weight=class_weight,
        random_state=seed,
        max_iter=1000,
    )
    models['Random Forest'] = RandomForestClassifier(
        class_weight=class_weight,
        random_state=seed,
        n_estimators=100,
    )
    models['Balanced Random Forest'] = BalancedRandomForestClassifier(
        random_state=seed,
        n_estimators=100,
    )
    models['XGBoost'] = xgb.XGBClassifier(
        random_state=seed,
        eval_metric='logloss',
        use_label_encoder=False,
    )
    models['LightGBM'] = lgb.LGBMClassifier(
        random_state=seed,
        class_weight=class_weight,
        verbose=-1,
    )
    models['Gradient Boosting'] = GradientBoostingClassifier(random_state=seed)

    return models


print("Model definitions created")

In [None]:
# 3.2 Evaluation metrics function

def evaluate_model(model, X_test, y_test, model_name='Model'):
    """Score a fitted binary classifier on a held-out set.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X_test: Held-out feature matrix.
        y_test: True labels for ``X_test``.
        model_name: Label stored under the 'Model' key of the result.

    Returns:
        A ``(results, y_pred, y_score)`` tuple where ``results`` is a dict
        with keys 'Model', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC' and
        'PR-AUC'; ``y_pred`` holds the hard predictions; and ``y_score`` is
        column 1 of ``predict_proba``, or the ``decision_function`` output
        when the model has no ``predict_proba``, or ``None`` when it has
        neither (in which case both AUC entries are ``None``).
    """
    y_pred = model.predict(X_test)

    # Ranking metrics only need a continuous score, so fall back to
    # decision_function for models without predict_proba rather than
    # silently dropping ROC-AUC / PR-AUC for them.
    if hasattr(model, 'predict_proba'):
        # Column 1 is taken as the positive class (assumes binary labels).
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_pred_proba = model.decision_function(X_test)
    else:
        y_pred_proba = None

    # zero_division=0 spells out the value used when a model predicts no
    # positives (or the test set has none), instead of relying on the
    # default "warn" behaviour.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    if y_pred_proba is not None:
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        pr_auc = average_precision_score(y_test, y_pred_proba)
    else:
        roc_auc = None
        pr_auc = None

    results = {
        'Model': model_name,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'PR-AUC': pr_auc
    }

    return results, y_pred, y_pred_proba


print("Evaluation function defined")

## 4. Model Training - Fraud Data (E-commerce)

In [None]:
# 4.1 Train models on fraud data with different sampling strategies

fraud_results = []   # one metrics dict per (model, sampling strategy) run
fraud_models = {}    # fitted estimators keyed "<model name>_<strategy>"

print("Training models on E-commerce Fraud Data...")
print("=" * 50)

# Sampling strategies compared in this section
best_sampling_strategies = ['none', 'smote', 'borderline_smote']

for sampling_strategy in best_sampling_strategies:
    print(f"\n--- Sampling Strategy: {sampling_strategy.upper()} ---")

    # Resample the training split only; the test split is never resampled
    X_fraud_resampled, y_fraud_resampled = apply_sampling_strategy(
        X_fraud_train, y_fraud_train, strategy=sampling_strategy
    )

    # A fresh get_models() call per strategy gives every run unfitted estimators
    for model_name, model in get_models().items():
        print(f"\nTraining {model_name}...")
        run_label = f"{model_name} ({sampling_strategy})"

        try:
            model.fit(X_fraud_resampled, y_fraud_resampled)

            # Predictions and scores are not needed here, only the metrics
            results, _, _ = evaluate_model(
                model, X_fraud_test, y_fraud_test, run_label
            )
            results['Sampling'] = sampling_strategy
            fraud_results.append(results)

            # Every successfully fitted model is kept, not just the best one
            fraud_models[f"{model_name}_{sampling_strategy}"] = model

            print(f"F1-Score: {results['F1-Score']:.4f}, Precision: {results['Precision']:.4f}, Recall: {results['Recall']:.4f}")

        except Exception as e:
            # Best effort: report the failure and move on to the next model
            print(f"Error training {model_name}: {str(e)}")

print("\nFraud data model training completed!")

In [None]:
# 4.2 Display fraud data results

# Rank every fraud-data run by F1, best first
fraud_results_df = pd.DataFrame(fraud_results).sort_values('F1-Score', ascending=False)

print("E-COMMERCE FRAUD DETECTION RESULTS:")
print("=" * 60)
print(fraud_results_df.round(4))

# Top row after sorting is the best run by F1
best_fraud_model = fraud_results_df.iloc[0]
print(f"\nBest Model for E-commerce Fraud: {best_fraud_model['Model']}")
for metric in ('F1-Score', 'Precision', 'Recall'):
    print(f"{metric}: {best_fraud_model[metric]:.4f}")

## 5. Model Training - Credit Card Data

In [None]:
# 5.1 Train models on credit card data

cc_results = []   # one metrics dict per (model, sampling strategy) run
cc_models = {}    # fitted estimators keyed "<model name>_<strategy>"

print("Training models on Credit Card Data...")
print("=" * 50)

# Same strategy list as the fraud run (best_sampling_strategies from section 4.1)
for sampling_strategy in best_sampling_strategies:
    print(f"\n--- Sampling Strategy: {sampling_strategy.upper()} ---")

    # Resample the training split only; the test split is never resampled
    X_cc_resampled, y_cc_resampled = apply_sampling_strategy(
        X_cc_train, y_cc_train, strategy=sampling_strategy
    )

    # A fresh get_models() call per strategy gives every run unfitted estimators
    for model_name, model in get_models().items():
        print(f"\nTraining {model_name}...")
        run_label = f"{model_name} ({sampling_strategy})"

        try:
            model.fit(X_cc_resampled, y_cc_resampled)

            # Predictions and scores are not needed here, only the metrics
            results, _, _ = evaluate_model(
                model, X_cc_test, y_cc_test, run_label
            )
            results['Sampling'] = sampling_strategy
            cc_results.append(results)

            # Every successfully fitted model is kept, not just the best one
            cc_models[f"{model_name}_{sampling_strategy}"] = model

            print(f"F1-Score: {results['F1-Score']:.4f}, Precision: {results['Precision']:.4f}, Recall: {results['Recall']:.4f}")

        except Exception as e:
            # Best effort: report the failure and move on to the next model
            print(f"Error training {model_name}: {str(e)}")

print("\nCredit card data model training completed!")

In [None]:
# 5.2 Display credit card results

# Rank every credit-card run by F1, best first
cc_results_df = pd.DataFrame(cc_results).sort_values('F1-Score', ascending=False)

print("CREDIT CARD FRAUD DETECTION RESULTS:")
print("=" * 60)
print(cc_results_df.round(4))

# Top row after sorting is the best run by F1
best_cc_model = cc_results_df.iloc[0]
print(f"\nBest Model for Credit Card Fraud: {best_cc_model['Model']}")
for metric in ('F1-Score', 'Precision', 'Recall'):
    print(f"{metric}: {best_cc_model[metric]:.4f}")

## 6. Hyperparameter Optimization

In [None]:
# 6.1 Hyperparameter tuning for best models

def optimize_model(model_class, param_grid, X_train, y_train, cv=3, scoring='f1'):
    """Run a grid search over ``param_grid`` and return the winning estimator.

    Args:
        model_class: Estimator handed to ``GridSearchCV`` unchanged. Despite
            the name, callers in this notebook pass an estimator *instance*.
        param_grid: Parameter grid forwarded to ``GridSearchCV``.
        X_train: Training features used for the search.
        y_train: Training labels used for the search.
        cv: Number of stratified, shuffled folds (shuffle seed fixed at 42).
        scoring: Scoring name forwarded to ``GridSearchCV``.

    Returns:
        ``(best_estimator_, best_params_, best_score_)`` from the fitted search.
    """
    # NOTE(review): if X_train was oversampled before this call, the folds
    # will share synthetic neighbours and the CV score is likely optimistic.
    fold_splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    search = GridSearchCV(
        estimator=model_class,
        param_grid=param_grid,
        cv=fold_splitter,
        scoring=scoring,
        n_jobs=-1,   # use all available cores
        verbose=1,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_, search.best_params_, search.best_score_

# Define parameter grids for top models
# Search spaces keyed by estimator class name; each inner mapping goes to
# GridSearchCV as its parameter grid.
param_grids = {
    'RandomForestClassifier': dict(
        n_estimators=[100, 200],
        max_depth=[10, 20, None],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    'XGBClassifier': dict(
        n_estimators=[100, 200],
        max_depth=[3, 6, 9],
        learning_rate=[0.01, 0.1, 0.2],
        subsample=[0.8, 1.0],
    ),
    'LGBMClassifier': dict(
        n_estimators=[100, 200],
        max_depth=[3, 6, 9],
        learning_rate=[0.01, 0.1, 0.2],
        num_leaves=[31, 50, 100],
    ),
}

print("Hyperparameter optimization setup completed")

In [None]:
# 6.2 Optimize best models for fraud data

# Local import: imbalanced-learn's Pipeline runs its sampler steps during fit
# only, which is what lets SMOTE live inside the cross-validation loop.
from imblearn.pipeline import Pipeline as ImbPipeline

print("Optimizing models for E-commerce Fraud Data...")


def _tune_fraud_model(estimator, grid):
    """Grid-search ``estimator`` on the fraud training data with SMOTE inside CV.

    Oversampling the whole training set before cross-validation lets synthetic
    points interpolated from validation-fold rows land in the training folds,
    which inflates the CV F1 and biases the selected hyperparameters. Wrapping
    SMOTE and the estimator in one pipeline makes each fold resample only its
    own training portion and score on untouched validation rows.

    Args:
        estimator: Unfitted classifier to tune.
        grid: Parameter grid keyed by the estimator's own parameter names.

    Returns:
        ``(fitted classifier, best params keyed by plain names, best CV score)``.
    """
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', estimator),
    ])
    # GridSearchCV addresses pipeline-step parameters as "<step>__<param>".
    step_grid = {f'model__{param}': values for param, values in grid.items()}
    best_pipeline, best_params, best_score = optimize_model(
        pipeline, step_grid, X_fraud_train, y_fraud_train
    )
    # Strip the step prefix so reported params match the grid definitions, and
    # hand back the inner classifier (refit on the SMOTE-resampled full
    # training set) so later cells receive the same estimator type as before.
    plain_params = {key.split('__', 1)[1]: value for key, value in best_params.items()}
    return best_pipeline.named_steps['model'], plain_params, best_score


optimized_fraud_models = {}

# Optimize Random Forest
print("\nOptimizing Random Forest...")
rf_optimized, rf_best_params, rf_best_score = _tune_fraud_model(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grids['RandomForestClassifier'],
)
optimized_fraud_models['Random Forest'] = rf_optimized
print(f"Best RF params: {rf_best_params}")
print(f"Best RF score: {rf_best_score:.4f}")

# Optimize XGBoost
print("\nOptimizing XGBoost...")
xgb_optimized, xgb_best_params, xgb_best_score = _tune_fraud_model(
    xgb.XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False),
    param_grids['XGBClassifier'],
)
optimized_fraud_models['XGBoost'] = xgb_optimized
print(f"Best XGB params: {xgb_best_params}")
print(f"Best XGB score: {xgb_best_score:.4f}")

In [None]:
# 6.3 Optimize best models for credit card data

# Local import: imbalanced-learn's Pipeline runs its sampler steps during fit
# only, which is what lets SMOTE live inside the cross-validation loop.
from imblearn.pipeline import Pipeline as ImbPipeline

print("Optimizing models for Credit Card Data...")


def _tune_cc_model(estimator, grid):
    """Grid-search ``estimator`` on the credit card training data with SMOTE inside CV.

    Oversampling the whole training set before cross-validation lets synthetic
    points interpolated from validation-fold rows land in the training folds,
    which inflates the CV F1 and biases the selected hyperparameters. Wrapping
    SMOTE and the estimator in one pipeline makes each fold resample only its
    own training portion and score on untouched validation rows.

    Args:
        estimator: Unfitted classifier to tune.
        grid: Parameter grid keyed by the estimator's own parameter names.

    Returns:
        ``(fitted classifier, best params keyed by plain names, best CV score)``.
    """
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', estimator),
    ])
    # GridSearchCV addresses pipeline-step parameters as "<step>__<param>".
    step_grid = {f'model__{param}': values for param, values in grid.items()}
    best_pipeline, best_params, best_score = optimize_model(
        pipeline, step_grid, X_cc_train, y_cc_train
    )
    # Strip the step prefix so reported params match the grid definitions, and
    # hand back the inner classifier (refit on the SMOTE-resampled full
    # training set) so later cells receive the same estimator type as before.
    plain_params = {key.split('__', 1)[1]: value for key, value in best_params.items()}
    return best_pipeline.named_steps['model'], plain_params, best_score


optimized_cc_models = {}

# Optimize Random Forest
print("\nOptimizing Random Forest...")
rf_cc_optimized, rf_cc_best_params, rf_cc_best_score = _tune_cc_model(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grids['RandomForestClassifier'],
)
optimized_cc_models['Random Forest'] = rf_cc_optimized
print(f"Best RF params: {rf_cc_best_params}")
print(f"Best RF score: {rf_cc_best_score:.4f}")

# Optimize XGBoost
print("\nOptimizing XGBoost...")
xgb_cc_optimized, xgb_cc_best_params, xgb_cc_best_score = _tune_cc_model(
    xgb.XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False),
    param_grids['XGBClassifier'],
)
optimized_cc_models['XGBoost'] = xgb_cc_optimized
print(f"Best XGB params: {xgb_cc_best_params}")
print(f"Best XGB score: {xgb_cc_best_score:.4f}")

## 7. Final Model Evaluation

In [None]:
# 7.1 Evaluate optimized models

print("FINAL OPTIMIZED MODEL EVALUATION")
print("=" * 50)

final_results = []

# One entry per dataset:
# (section heading, tuned models, test features, test labels, name prefix, dataset tag)
evaluation_plan = (
    ("E-commerce Fraud Detection - Optimized Models:",
     optimized_fraud_models, X_fraud_test, y_fraud_test, "Fraud", 'E-commerce Fraud'),
    ("\nCredit Card Fraud Detection - Optimized Models:",
     optimized_cc_models, X_cc_test, y_cc_test, "CC", 'Credit Card Fraud'),
)

for heading, tuned_models, X_holdout, y_holdout, prefix, dataset_tag in evaluation_plan:
    print(heading)
    for model_name, model in tuned_models.items():
        results, _, _ = evaluate_model(model, X_holdout, y_holdout, f"{prefix}-{model_name}")
        results['Dataset'] = dataset_tag
        final_results.append(results)
        print(f"{model_name}: F1={results['F1-Score']:.4f}, Precision={results['Precision']:.4f}, Recall={results['Recall']:.4f}")

# Collect both datasets' rows into a single summary table
final_results_df = pd.DataFrame(final_results)
print("\nFinal Results Summary:")
print(final_results_df.round(4))

## 8. Model Comparison Visualization

In [None]:
# 8.1 Visualize model performance comparison

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# First ten rows of each results table (the tables are sorted by F1 in 4.2 / 5.2)
fraud_viz_data = fraud_results_df.head(10)
cc_viz_data = cc_results_df.head(10)

# One grid row per dataset: (row index, data, title prefix, bar colour, scatter colormap)
panels = (
    (0, fraud_viz_data, 'E-commerce Fraud', 'coral', 'viridis'),
    (1, cc_viz_data, 'Credit Card Fraud', 'lightblue', 'plasma'),
)

for row, viz_data, dataset_title, bar_color, scatter_cmap in panels:
    bar_ax = axes[row, 0]
    scatter_ax = axes[row, 1]

    # Left panel: F1-Score per model as a bar chart
    viz_data.plot(x='Model', y='F1-Score', kind='bar', ax=bar_ax, color=bar_color)
    bar_ax.set_title(f'{dataset_title} - F1-Score Comparison', fontweight='bold')
    bar_ax.tick_params(axis='x', rotation=45)
    bar_ax.set_ylabel('F1-Score')

    # Right panel: precision against recall, points coloured by F1-Score
    scatter_ax.scatter(
        viz_data['Recall'],
        viz_data['Precision'],
        c=viz_data['F1-Score'],
        cmap=scatter_cmap,
        s=100,
    )
    scatter_ax.set_xlabel('Recall')
    scatter_ax.set_ylabel('Precision')
    scatter_ax.set_title(f'{dataset_title} - Precision vs Recall', fontweight='bold')

plt.tight_layout()
plt.show()

## 9. Save Models and Results

In [None]:
# 9.1 Save all results and models

# Save the three result tables as CSV (row index omitted)
for results_frame, csv_path in (
    (fraud_results_df, '../results/fraud_model_results.csv'),
    (cc_results_df, '../results/creditcard_model_results.csv'),
    (final_results_df, '../results/final_optimized_results.csv'),
):
    results_frame.to_csv(csv_path, index=False)

# Bundle the tuned models with the names of the best baseline runs.
# Note the names come from the baseline tables (4.2 / 5.2), so they are not
# keys into the tuned-model dicts stored next to them.
best_models = dict(
    fraud_models=optimized_fraud_models,
    cc_models=optimized_cc_models,
    fraud_best_model_name=best_fraud_model['Model'],
    cc_best_model_name=best_cc_model['Model'],
)

with open('../results/best_models.pkl', 'wb') as f:
    pickle.dump(best_models, f)

# Also write each tuned model to its own file: <dataset>_<model slug>_optimized.pkl
for dataset_prefix, tuned_models in (
    ('fraud', optimized_fraud_models),
    ('cc', optimized_cc_models),
):
    for name, model in tuned_models.items():
        slug = name.lower().replace(" ", "_")
        joblib.dump(model, f'../results/{dataset_prefix}_{slug}_optimized.pkl')

print("All models and results saved successfully!")
print("\nSaved files:")
for saved_entry in (
    "fraud_model_results.csv",
    "creditcard_model_results.csv",
    "final_optimized_results.csv",
    "best_models.pkl",
    "Individual optimized model files",
):
    print(f"- {saved_entry}")

## 10. Model Training Summary

### Key Findings:

#### Sampling Strategies:
- **SMOTE**: Generally provides good balance between precision and recall
- **Borderline SMOTE**: Often performs better on highly imbalanced datasets
- **No Sampling + Class Weights**: Can work well with ensemble methods

#### Model Performance:
- **Tree-based models** (Random Forest, XGBoost, LightGBM) typically perform best
- **Ensemble methods** handle imbalanced data better
- **Hyperparameter optimization** can improve performance; compare the optimized scores in Section 7 with the baseline scores in Sections 4–5 to quantify the gain

#### Evaluation Metrics:
- **F1-Score**: Harmonic mean of precision and recall; used in this notebook as the primary metric for ranking models on imbalanced data
- **Precision**: Important for minimizing false positives
- **Recall**: Critical for catching actual fraud cases
- **PR-AUC**: More informative than ROC-AUC when the positive (fraud) class is rare, because it is not inflated by the large number of true negatives

### Next Steps:
1. Detailed model evaluation with confusion matrices
2. Model interpretation using SHAP values
3. Business impact analysis
4. Production deployment considerations