# Fraud Detection - Model Training and Imbalanced Data Handling


In [2]:
# Force compatible versions to fix binary incompatibility
!pip install -U numpy==1.23.5 scikit-learn==1.3.2 imbalanced-learn==0.11.0 --quiet
!pip install -U xgboost lightgbm tqdm joblib matplotlib seaborn pandas --quiet


In [3]:
# ✅ Core Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
from collections import Counter

# ✅ Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb

# ✅ Model Selection & Evaluation
from sklearn.model_selection import (
    cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
)
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    average_precision_score, f1_score, precision_score, recall_score,
    precision_recall_curve
)

# ✅ Utilities
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
import joblib

# ✅ Handle Imbalanced Data
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier

# ✅ Configurations
warnings.filterwarnings('ignore')
np.random.seed(42)

print("✅ All libraries imported successfully.")


✅ All libraries imported successfully.


## 1. Load Preprocessed Data

In [4]:
import pandas as pd

# Load the training set
df_train = pd.read_csv('/kaggle/input/result/X_cc_train_scaled.csv')
print(df_train.head())


         V1        V2        V3        V4        V5        V6        V7  \
0  0.993379 -0.456037 -0.894052 -0.467284  1.089217  3.024383 -1.194852   
1  1.038507 -0.029349 -2.018302  0.175133  2.133506  2.478840 -0.001832   
2 -0.506766  0.366065  0.470114 -0.700918 -0.598748  1.470411 -1.786684   
3  1.166419 -0.909447 -0.493095 -1.178149 -1.010692 -0.262292 -1.153123   
4 -0.229485 -0.613041  0.076742 -2.440089  0.518711 -0.109914  0.407186   

         V8        V9       V10  ...  v_features_mean  v_features_std  \
0  0.957057  1.281376 -0.144546  ...         1.292780        0.371626   
1  0.566704  0.041121  0.262604  ...         0.979321        0.467642   
2 -4.227592  0.000064 -1.849641  ...        -1.531851        1.113362   
3  0.008765 -1.019866  1.617041  ...        -1.160232       -0.069793   
4 -0.095161 -0.041449 -0.514215  ...        -1.708207       -0.033004   

   v_features_max  v_features_min  v_features_range  amount_category_Very Low  \
0        1.547740        0.33

In [5]:
import pandas as pd
import pickle
from collections import Counter

print("Loading preprocessed data...")

# Fraud data
X_fraud_train = pd.read_csv('/kaggle/input/result/X_fraud_train_scaled.csv')
X_fraud_test = pd.read_csv('/kaggle/input/result/X_fraud_test_scaled.csv')
y_fraud_train = pd.read_csv('/kaggle/input/result/y_fraud_train.csv').squeeze()
y_fraud_test = pd.read_csv('/kaggle/input/result/y_fraud_test.csv').squeeze()

# Credit card data
X_cc_train = pd.read_csv('/kaggle/input/result/X_cc_train_scaled.csv')
X_cc_test = pd.read_csv('/kaggle/input/result/X_cc_test_scaled.csv')
y_cc_train = pd.read_csv('/kaggle/input/result/y_cc_train.csv').squeeze()
y_cc_test = pd.read_csv('/kaggle/input/result/y_cc_test.csv').squeeze()

# Load feature information
with open('/kaggle/input/result/feature_info.pkl', 'rb') as f:
    feature_info = pickle.load(f)

# Print dataset shapes
print(f"Fraud data - Train: {X_fraud_train.shape}, Test: {X_fraud_test.shape}")
print(f"Credit card data - Train: {X_cc_train.shape}, Test: {X_cc_test.shape}")

# Class distribution
print(f"\nFraud data class distribution: {Counter(y_fraud_train)}")
print(f"Credit card data class distribution: {Counter(y_cc_train)}")


Loading preprocessed data...
Fraud data - Train: (120889, 223), Test: (30223, 223)
Credit card data - Train: (227845, 59), Test: (56962, 59)

Fraud data class distribution: Counter({0: 109568, 1: 11321})
Credit card data class distribution: Counter({0: 227451, 1: 394})


## 2. Imbalanced Data Handling Techniques

In [6]:
def apply_sampling_strategy(X, y, strategy='smote', random_state=42):
    """Apply different sampling strategies to handle imbalanced data"""

    if strategy == 'none':
        return X, y

    elif strategy == 'smote':
        sampler = SMOTE(random_state=random_state)

    elif strategy == 'adasyn':
        sampler = ADASYN(random_state=random_state)

    elif strategy == 'borderline_smote':
        sampler = BorderlineSMOTE(random_state=random_state)

    elif strategy == 'smote_tomek':
        sampler = SMOTETomek(random_state=random_state)

    elif strategy == 'smote_enn':
        sampler = SMOTEENN(random_state=random_state)

    elif strategy == 'random_undersample':
        sampler = RandomUnderSampler(random_state=random_state)

    else:
        raise ValueError(f"Unknown sampling strategy: {strategy}")

    X_resampled, y_resampled = sampler.fit_resample(X, y)

    print(f"Original distribution: {Counter(y)}")
    print(f"Resampled distribution: {Counter(y_resampled)}")

    return X_resampled, y_resampled

# Test different sampling strategies
sampling_strategies = ['none', 'smote', 'adasyn', 'borderline_smote', 'smote_tomek']

print("Available sampling strategies:", sampling_strategies)

Available sampling strategies: ['none', 'smote', 'adasyn', 'borderline_smote', 'smote_tomek']


## 3. Model Definitions

In [7]:
def get_models(class_weight='balanced'):
    """Get dictionary of models with appropriate class weights"""

    models = {
        'Logistic Regression': LogisticRegression(
            class_weight=class_weight,
            random_state=42,
            max_iter=1000
        ),

        'Random Forest': RandomForestClassifier(
            class_weight=class_weight,
            random_state=42,
            n_estimators=100
        ),

        'Balanced Random Forest': BalancedRandomForestClassifier(
            random_state=42,
            n_estimators=100
        ),

        'XGBoost': xgb.XGBClassifier(
            random_state=42,
            eval_metric='logloss',
            use_label_encoder=False
        ),

        'LightGBM': lgb.LGBMClassifier(
            random_state=42,
            class_weight=class_weight,
            verbose=-1
        ),

        'Gradient Boosting': GradientBoostingClassifier(
            random_state=42
        )
    }

    return models

print("Model definitions created")

Model definitions created


In [8]:
def evaluate_model(model, X_test, y_test, model_name='Model'):
    """Comprehensive model evaluation"""

    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Calculate metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    if y_pred_proba is not None:
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        pr_auc = average_precision_score(y_test, y_pred_proba)
    else:
        roc_auc = None
        pr_auc = None

    results = {
        'Model': model_name,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'PR-AUC': pr_auc
    }

    return results, y_pred, y_pred_proba

print("Evaluation function defined")

Evaluation function defined


## 4. Model Training - Fraud Data (E-commerce)

In [9]:
fraud_results = []
fraud_models = {}

print("Training models on E-commerce Fraud Data...")
print("=" * 50)

# Test different sampling strategies
best_sampling_strategies = ['none', 'smote', 'borderline_smote']

for sampling_strategy in best_sampling_strategies:
    print(f"\n--- Sampling Strategy: {sampling_strategy.upper()} ---")

    # Apply sampling
    X_fraud_resampled, y_fraud_resampled = apply_sampling_strategy(
        X_fraud_train, y_fraud_train, strategy=sampling_strategy
    )

    # Get models
    models = get_models()

    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")

        try:
            # Train model
            model.fit(X_fraud_resampled, y_fraud_resampled)

            # Evaluate
            results, y_pred, y_pred_proba = evaluate_model(
                model, X_fraud_test, y_fraud_test,
                f"{model_name} ({sampling_strategy})"
            )

            results['Sampling'] = sampling_strategy
            fraud_results.append(results)

            # Store best models
            key = f"{model_name}_{sampling_strategy}"
            fraud_models[key] = model

            print(f"F1-Score: {results['F1-Score']:.4f}, Precision: {results['Precision']:.4f}, Recall: {results['Recall']:.4f}")

        except Exception as e:
            print(f"Error training {model_name}: {str(e)}")
            continue

print("\nFraud data model training completed!")

Training models on E-commerce Fraud Data...

--- Sampling Strategy: NONE ---

Training Logistic Regression...
F1-Score: 0.9113, Precision: 0.8406, Recall: 0.9951

Training Random Forest...
F1-Score: 0.9272, Precision: 0.9695, Recall: 0.8883

Training Balanced Random Forest...
F1-Score: 0.9111, Precision: 0.8368, Recall: 1.0000

Training XGBoost...
F1-Score: 0.9304, Precision: 0.9322, Recall: 0.9286

Training LightGBM...


[LightGBM] [Fatal] Do not support special JSON characters in feature name.


Error training LightGBM: Do not support special JSON characters in feature name.

Training Gradient Boosting...
F1-Score: 0.9338, Precision: 0.9765, Recall: 0.8947

--- Sampling Strategy: SMOTE ---
Original distribution: Counter({0: 109568, 1: 11321})
Resampled distribution: Counter({0: 109568, 1: 109568})

Training Logistic Regression...
F1-Score: 0.9099, Precision: 0.8426, Recall: 0.9890

Training Random Forest...
F1-Score: 0.9125, Precision: 0.8817, Recall: 0.9456

Training Balanced Random Forest...
F1-Score: 0.9146, Precision: 0.8816, Recall: 0.9502

Training XGBoost...
F1-Score: 0.9295, Precision: 0.9300, Recall: 0.9290

Training LightGBM...


[LightGBM] [Fatal] Do not support special JSON characters in feature name.


Error training LightGBM: Do not support special JSON characters in feature name.

Training Gradient Boosting...
F1-Score: 0.9244, Precision: 0.8672, Recall: 0.9898

--- Sampling Strategy: BORDERLINE_SMOTE ---
Original distribution: Counter({0: 109568, 1: 11321})
Resampled distribution: Counter({0: 109568, 1: 109568})

Training Logistic Regression...
F1-Score: 0.9109, Precision: 0.8413, Recall: 0.9929

Training Random Forest...
F1-Score: 0.9139, Precision: 0.8867, Recall: 0.9428

Training Balanced Random Forest...
F1-Score: 0.9102, Precision: 0.8820, Recall: 0.9403

Training XGBoost...
F1-Score: 0.9283, Precision: 0.9237, Recall: 0.9329

Training LightGBM...


[LightGBM] [Fatal] Do not support special JSON characters in feature name.


Error training LightGBM: Do not support special JSON characters in feature name.

Training Gradient Boosting...
F1-Score: 0.9223, Precision: 0.8630, Recall: 0.9905

Fraud data model training completed!


## Display fraud data results

In [10]:
fraud_results_df = pd.DataFrame(fraud_results)
fraud_results_df = fraud_results_df.sort_values('F1-Score', ascending=False)

print("E-COMMERCE FRAUD DETECTION RESULTS:")
print("=" * 60)
print(fraud_results_df.round(4))

# Best model for fraud data
best_fraud_model = fraud_results_df.iloc[0]
print(f"\nBest Model for E-commerce Fraud: {best_fraud_model['Model']}")
print(f"F1-Score: {best_fraud_model['F1-Score']:.4f}")
print(f"Precision: {best_fraud_model['Precision']:.4f}")
print(f"Recall: {best_fraud_model['Recall']:.4f}")

E-COMMERCE FRAUD DETECTION RESULTS:
                                        Model  Precision  Recall  F1-Score  \
4                    Gradient Boosting (none)     0.9765  0.8947    0.9338   
3                              XGBoost (none)     0.9322  0.9286    0.9304   
8                             XGBoost (smote)     0.9300  0.9290    0.9295   
13                 XGBoost (borderline_smote)     0.9237  0.9329    0.9283   
1                        Random Forest (none)     0.9695  0.8883    0.9272   
9                   Gradient Boosting (smote)     0.8672  0.9898    0.9244   
14       Gradient Boosting (borderline_smote)     0.8630  0.9905    0.9223   
7              Balanced Random Forest (smote)     0.8816  0.9502    0.9146   
11           Random Forest (borderline_smote)     0.8867  0.9428    0.9139   
6                       Random Forest (smote)     0.8817  0.9456    0.9125   
0                  Logistic Regression (none)     0.8406  0.9951    0.9113   
2               Balanced Ran

## 5. Model Training - Credit Card Data

In [11]:
cc_results = []
cc_models = {}

print("Training models on Credit Card Data...")
print("=" * 50)

# Test different sampling strategies
for sampling_strategy in best_sampling_strategies:
    print(f"\n--- Sampling Strategy: {sampling_strategy.upper()} ---")

    # Apply sampling
    X_cc_resampled, y_cc_resampled = apply_sampling_strategy(
        X_cc_train, y_cc_train, strategy=sampling_strategy
    )

    # Get models
    models = get_models()

    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")

        try:
            # Train model
            model.fit(X_cc_resampled, y_cc_resampled)

            # Evaluate
            results, y_pred, y_pred_proba = evaluate_model(
                model, X_cc_test, y_cc_test,
                f"{model_name} ({sampling_strategy})"
            )

            results['Sampling'] = sampling_strategy
            cc_results.append(results)

            # Store best models
            key = f"{model_name}_{sampling_strategy}"
            cc_models[key] = model

            print(f"F1-Score: {results['F1-Score']:.4f}, Precision: {results['Precision']:.4f}, Recall: {results['Recall']:.4f}")

        except Exception as e:
            print(f"Error training {model_name}: {str(e)}")
            continue

print("\nCredit card data model training completed!")

Training models on Credit Card Data...

--- Sampling Strategy: NONE ---

Training Logistic Regression...
F1-Score: 0.0955, Precision: 0.0504, Recall: 0.9082

Training Random Forest...
F1-Score: 0.8475, Precision: 0.9494, Recall: 0.7653

Training Balanced Random Forest...
F1-Score: 0.1066, Precision: 0.0566, Recall: 0.9184

Training XGBoost...
F1-Score: 0.8556, Precision: 0.8989, Recall: 0.8163

Training LightGBM...
F1-Score: 0.8458, Precision: 0.8252, Recall: 0.8673

Training Gradient Boosting...
F1-Score: 0.8066, Precision: 0.8795, Recall: 0.7449

--- Sampling Strategy: SMOTE ---
Original distribution: Counter({0: 227451, 1: 394})
Resampled distribution: Counter({0: 227451, 1: 227451})

Training Logistic Regression...
F1-Score: 0.0926, Precision: 0.0488, Recall: 0.9082

Training Random Forest...
F1-Score: 0.8646, Precision: 0.8830, Recall: 0.8469

Training Balanced Random Forest...
F1-Score: 0.8571, Precision: 0.8901, Recall: 0.8265

Training XGBoost...
F1-Score: 0.8513, Precision: 0.

## Display credit card results

In [12]:
cc_results_df = pd.DataFrame(cc_results)
cc_results_df = cc_results_df.sort_values('F1-Score', ascending=False)

print("CREDIT CARD FRAUD DETECTION RESULTS:")
print("=" * 60)
print(cc_results_df.round(4))

# Best model for credit card data
best_cc_model = cc_results_df.iloc[0]
print(f"\nBest Model for Credit Card Fraud: {best_cc_model['Model']}")
print(f"F1-Score: {best_cc_model['F1-Score']:.4f}")
print(f"Precision: {best_cc_model['Precision']:.4f}")
print(f"Recall: {best_cc_model['Recall']:.4f}")

CREDIT CARD FRAUD DETECTION RESULTS:
                                        Model  Precision  Recall  F1-Score  \
7                       Random Forest (smote)     0.8830  0.8469    0.8646   
8              Balanced Random Forest (smote)     0.8901  0.8265    0.8571   
3                              XGBoost (none)     0.8989  0.8163    0.8556   
9                             XGBoost (smote)     0.8557  0.8469    0.8513   
15                 XGBoost (borderline_smote)     0.8557  0.8469    0.8513   
13           Random Forest (borderline_smote)     0.9277  0.7857    0.8508   
1                        Random Forest (none)     0.9494  0.7653    0.8475   
4                             LightGBM (none)     0.8252  0.8673    0.8458   
14  Balanced Random Forest (borderline_smote)     0.9157  0.7755    0.8398   
5                    Gradient Boosting (none)     0.8795  0.7449    0.8066   
16                LightGBM (borderline_smote)     0.7736  0.8367    0.8039   
10                         

## 6. Hyperparameter Optimization

In [13]:
def optimize_model(model_class, param_grid, X_train, y_train, cv=3, scoring='f1'):
    """Optimize hyperparameters using GridSearchCV"""

    grid_search = GridSearchCV(
        model_class,
        param_grid,
        cv=StratifiedKFold(n_splits=cv, shuffle=True, random_state=42),
        scoring=scoring,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_

# Define parameter grids for top models
param_grids = {
    'RandomForestClassifier': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },

    'XGBClassifier': {
        'n_estimators': [100, 200],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0]
    },

    'LGBMClassifier': {
        'n_estimators': [100, 200],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'num_leaves': [31, 50, 100]
    }
}

print("Hyperparameter optimization setup completed")

Hyperparameter optimization setup completed


## Optimize best models for fraud data

In [14]:
print("Optimizing models for E-commerce Fraud Data...")
# Use SMOTE for optimization (generally performs well)
X_fraud_smote, y_fraud_smote = apply_sampling_strategy(X_fraud_train, y_fraud_train, 'smote')
optimized_fraud_models = {}
# Optimize Random Forest
print("\nOptimizing Random Forest...")
rf_optimized, rf_best_params, rf_best_score = optimize_model(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grids['RandomForestClassifier'],
    X_fraud_smote, y_fraud_smote
)
optimized_fraud_models['Random Forest'] = rf_optimized
print(f"Best RF params: {rf_best_params}")
print(f"Best RF score: {rf_best_score:.4f}")

# Optimize XGBoost
print("\nOptimizing XGBoost...")
xgb_optimized, xgb_best_params, xgb_best_score = optimize_model(
    xgb.XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False),
    param_grids['XGBClassifier'],
    X_fraud_smote, y_fraud_smote
)
optimized_fraud_models['XGBoost'] = xgb_optimized
print(f"Best XGB params: {xgb_best_params}")
print(f"Best XGB score: {xgb_best_score:.4f}")

Optimizing models for E-commerce Fraud Data...
Original distribution: Counter({0: 109568, 1: 11321})
Resampled distribution: Counter({0: 109568, 1: 109568})

Optimizing Random Forest...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best RF params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best RF score: 0.9922

Optimizing XGBoost...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best XGB params: {'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 200, 'subsample': 0.8}
Best XGB score: 0.9936


## Optimize best models for credit card data


In [None]:

print("Optimizing models for Credit Card Data...")

# Use SMOTE for optimization
X_cc_smote, y_cc_smote = apply_sampling_strategy(X_cc_train, y_cc_train, 'smote')

optimized_cc_models = {}

# Optimize Random Forest
print("\nOptimizing Random Forest...")
rf_cc_optimized, rf_cc_best_params, rf_cc_best_score = optimize_model(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grids['RandomForestClassifier'],
    X_cc_smote, y_cc_smote
)
optimized_cc_models['Random Forest'] = rf_cc_optimized
print(f"Best RF params: {rf_cc_best_params}")
print(f"Best RF score: {rf_cc_best_score:.4f}")

# Optimize XGBoost
print("\nOptimizing XGBoost...")
xgb_cc_optimized, xgb_cc_best_params, xgb_cc_best_score = optimize_model(
    xgb.XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False),
    param_grids['XGBClassifier'],
    X_cc_smote, y_cc_smote
)
optimized_cc_models['XGBoost'] = xgb_cc_optimized
print(f"Best XGB params: {xgb_cc_best_params}")
print(f"Best XGB score: {xgb_cc_best_score:.4f}")

## 7. Final Model Evaluation

In [None]:
print("FINAL OPTIMIZED MODEL EVALUATION")
print("=" * 50)

final_results = []

# Evaluate fraud models
print("E-commerce Fraud Detection - Optimized Models:")
for model_name, model in optimized_fraud_models.items():
    results, _, _ = evaluate_model(model, X_fraud_test, y_fraud_test, f"Fraud-{model_name}")
    results['Dataset'] = 'E-commerce Fraud'
    final_results.append(results)
    print(f"{model_name}: F1={results['F1-Score']:.4f}, Precision={results['Precision']:.4f}, Recall={results['Recall']:.4f}")

# Evaluate credit card models
print("\nCredit Card Fraud Detection - Optimized Models:")
for model_name, model in optimized_cc_models.items():
    results, _, _ = evaluate_model(model, X_cc_test, y_cc_test, f"CC-{model_name}")
    results['Dataset'] = 'Credit Card Fraud'
    final_results.append(results)
    print(f"{model_name}: F1={results['F1-Score']:.4f}, Precision={results['Precision']:.4f}, Recall={results['Recall']:.4f}")

# Create final results dataframe
final_results_df = pd.DataFrame(final_results)
print("\nFinal Results Summary:")
print(final_results_df.round(4))

## 8. Model Comparison Visualization

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Fraud data results
fraud_viz_data = fraud_results_df.head(10)  # Top 10 models

# F1-Score comparison
fraud_viz_data.plot(x='Model', y='F1-Score', kind='bar', ax=axes[0,0], color='coral')
axes[0,0].set_title('E-commerce Fraud - F1-Score Comparison', fontweight='bold')
axes[0,0].tick_params(axis='x', rotation=45)
axes[0,0].set_ylabel('F1-Score')

# Precision vs Recall
axes[0,1].scatter(fraud_viz_data['Recall'], fraud_viz_data['Precision'],
                 c=fraud_viz_data['F1-Score'], cmap='viridis', s=100)
axes[0,1].set_xlabel('Recall')
axes[0,1].set_ylabel('Precision')
axes[0,1].set_title('E-commerce Fraud - Precision vs Recall', fontweight='bold')

# Credit card data results
cc_viz_data = cc_results_df.head(10)  # Top 10 models

# F1-Score comparison
cc_viz_data.plot(x='Model', y='F1-Score', kind='bar', ax=axes[1,0], color='lightblue')
axes[1,0].set_title('Credit Card Fraud - F1-Score Comparison', fontweight='bold')
axes[1,0].tick_params(axis='x', rotation=45)
axes[1,0].set_ylabel('F1-Score')

# Precision vs Recall
axes[1,1].scatter(cc_viz_data['Recall'], cc_viz_data['Precision'],
                 c=cc_viz_data['F1-Score'], cmap='plasma', s=100)
axes[1,1].set_xlabel('Recall')
axes[1,1].set_ylabel('Precision')
axes[1,1].set_title('Credit Card Fraud - Precision vs Recall', fontweight='bold')

plt.tight_layout()
plt.show()

## 9. Save Models and Results

In [None]:
import os
import pickle
import joblib

# Ensure results directory exists
os.makedirs('./results', exist_ok=True)

# Save results
fraud_results_df.to_csv('./results/fraud_model_results.csv', index=False)
cc_results_df.to_csv('./results/creditcard_model_results.csv', index=False)
final_results_df.to_csv('./results/final_optimized_results.csv', index=False)

# Save best models dictionary
best_models = {
    'fraud_models': optimized_fraud_models,
    'cc_models': optimized_cc_models,
    'fraud_best_model_name': best_fraud_model['Model'],
    'cc_best_model_name': best_cc_model['Model']
}
with open('./results/best_models.pkl', 'wb') as f:
    pickle.dump(best_models, f)

# Save individual optimized models
for name, model in optimized_fraud_models.items():
    joblib.dump(model, f'./results/fraud_{name.lower().replace(" ", "_")}_optimized.pkl')

for name, model in optimized_cc_models.items():
    joblib.dump(model, f'./results/cc_{name.lower().replace(" ", "_")}_optimized.pkl')

# Summary
print("All models and results saved successfully!")
print("\nSaved files:")
print("- fraud_model_results.csv")
print("- creditcard_model_results.csv")
print("- final_optimized_results.csv")
print("- best_models.pkl")
print("- Individual optimized model files")


In [None]:
# Retry Gradient Boosting training with SMOTE for Credit Card Data with error handling
print("\nRetrying Gradient Boosting with SMOTE on Credit Card Data...")

sampling_strategy = 'smote'
model_name = 'Gradient Boosting'

try:
    # Apply sampling
    X_cc_resampled, y_cc_resampled = apply_sampling_strategy(
        X_cc_train, y_cc_train, strategy=sampling_strategy
    )

    # Get model
    models = get_models()
    model = models[model_name]

    print(f"\nTraining {model_name} with {sampling_strategy.upper()}...")

    # Train model
    model.fit(X_cc_resampled, y_cc_resampled)

    # Evaluate
    results, y_pred, y_pred_proba = evaluate_model(
        model, X_cc_test, y_cc_test,
        f"{model_name} ({sampling_strategy})"
    )

    results['Sampling'] = sampling_strategy
    cc_results.append(results)

    # Store model
    key = f"{model_name}_{sampling_strategy}"
    cc_models[key] = model

    print(f"F1-Score: {results['F1-Score']:.4f}, Precision: {results['Precision']:.4f}, Recall: {results['Recall']:.4f}")
    print(f"Successfully trained and evaluated {model_name} with {sampling_strategy.upper()}")

except Exception as e:
    print(f"Error training {model_name} with {sampling_strategy.upper()}: {str(e)}")

print("\nRetry attempt completed.")