# Credit Card Fraud Detection - Improved Approach

This notebook implements credit card fraud detection using:
- Feature engineering and univariate feature selection (SelectKBest)
- Multiple approaches for handling class imbalance (SMOTE, class weights)
- Two machine learning models (Logistic Regression & Random Forest)
- Decision-threshold optimization to maximize the F1-score
- Comprehensive evaluation metrics

## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Imbalanced Learning
from imblearn.over_sampling import SMOTE

# Evaluation Metrics
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    roc_curve, precision_recall_curve, f1_score, 
    accuracy_score, precision_score, recall_score,
    average_precision_score, matthews_corrcoef
)

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Global plot appearance: the 'seaborn-v0_8' matplotlib style sheet plus
# seaborn's "husl" colour palette for every figure created below
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 2. Load and Explore Data

In [None]:
# Load the dataset
# The path is relative to the kernel's working directory, so 'creditcard.csv'
# has to be available there for this cell to run.
df = pd.read_csv('creditcard.csv')
print(f"Dataset shape: {df.shape}")
# Last expression of the cell: displays the first rows as a table
df.head()

In [None]:
# Structural overview of the frame: size, dtype mix, total number of
# missing cells, and in-memory footprint (deep=True also counts object data)
print("Dataset Info:")
print(f"Shape: {df.shape}")
print("\nData types:")
print(df.dtypes.value_counts())
print("\nMissing values:")
print(df.isna().sum().sum())
print(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# Statistical summary
# describe() reports count, mean, std, min, quartiles and max per numeric column
df.describe()

## 3. Class Distribution Analysis

In [None]:
# Class balance: absolute counts, percentage share and majority:minority ratio
class_counts = df['Class'].value_counts()
print("Class Distribution:")
print(f"Normal transactions (Class 0): {class_counts[0]:,} ({class_counts[0]/len(df)*100:.3f}%)")
print(f"Fraud transactions (Class 1): {class_counts[1]:,} ({class_counts[1]/len(df)*100:.3f}%)")
print(f"Imbalance ratio: 1:{class_counts[0]//class_counts[1]}")

# Two side-by-side views of the same counts, sharing one colour pair
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
class_colors = ['skyblue', 'lightcoral']

# Left panel: bar chart of the raw counts
count_ax = axes[0]
class_counts.plot.bar(ax=count_ax, color=class_colors)
count_ax.set_title('Class Distribution (Count)')
count_ax.set_xlabel('Class')
count_ax.set_ylabel('Count')
count_ax.tick_params(axis='x', rotation=0)

# Right panel: pie chart of the shares.
# The fixed labels rely on value_counts() listing class 0 first (it sorts by
# descending count, and class 0 is the majority in this dataset).
share_ax = axes[1]
share_ax.pie(
    class_counts.values,
    labels=['Normal (0)', 'Fraud (1)'],
    autopct='%1.3f%%',
    colors=class_colors,
)
share_ax.set_title('Class Distribution (Percentage)')

plt.tight_layout()
plt.show()

## 4. Feature Analysis and Correlation

In [None]:
# Label vector and feature matrix (every column except the 'Class' label)
y = df['Class']
X = df.drop(columns='Class')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {list(X.columns)}")

In [None]:
# Absolute correlation of every feature with the label, strongest first
# (corrwith uses Pearson correlation by default)
correlations = X.corrwith(y).abs().sort_values(ascending=False)
print("Top 15 features correlated with target:")
print(correlations.head(15))

# Bar chart of all features in the same (descending) order.
# pandas draws on the current figure and returns the Axes it used.
plt.figure(figsize=(12, 8))
corr_ax = correlations.plot.bar(color='steelblue')
corr_ax.set_title('Feature Correlation with Target Variable (Absolute Values)')
corr_ax.set_xlabel('Features')
corr_ax.set_ylabel('Absolute Correlation')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 5. Data Preprocessing and Feature Engineering

In [None]:
# Simple feature engineering
def create_additional_features(df):
    """Return a copy of ``df`` extended with seven derived columns.

    The input frame is not modified. It must contain the columns ``Time``,
    ``Amount`` and ``V1`` ... ``V28``; a missing column raises ``KeyError``.

    Columns appended, in this order:
        Hour        -- ``Time`` / 3600, wrapped to [0, 24)
        Day         -- ``Time`` / 86400, wrapped to [0, 7)
        Amount_log  -- ``log1p`` of ``Amount``
        Amount_sqrt -- square root of ``Amount``
        V_sum, V_mean, V_std -- row-wise sum, mean and (pandas default,
                       sample) standard deviation over ``V1`` ... ``V28``
    """
    elapsed = df['Time']
    amount = df['Amount']

    # Row-wise statistics only look at the original V1..V28 columns
    v_features = [f'V{i}' for i in range(1, 29)]
    v_block = df[v_features]

    # assign() works on a copy and appends the keyword columns left to right
    return df.assign(
        Hour=(elapsed / 3600) % 24,
        Day=(elapsed / 86400) % 7,
        Amount_log=np.log1p(amount),
        Amount_sqrt=np.sqrt(amount),
        V_sum=v_block.sum(axis=1),
        V_mean=v_block.mean(axis=1),
        V_std=v_block.std(axis=1),
    )

# Run the feature-engineering step on the raw frame and report how many
# columns it contributed
df_engineered = df.pipe(create_additional_features)
print(f"Original features: {df.shape[1]}")
print(f"After feature engineering: {df_engineered.shape[1]}")
print(f"New features added: {df_engineered.shape[1] - df.shape[1]}")

In [None]:
# Rebuild the label vector and feature matrix from the engineered frame
y = df_engineered['Class']
X_engineered = df_engineered.drop(columns='Class')

print(f"Engineered features shape: {X_engineered.shape}")
print(f"New features: {[col for col in X_engineered.columns if col not in X.columns]}")

## 6. Feature Selection

In [None]:
# Hold out 20% of the rows as the test set before any fitting step.
# The split is stratified on y and uses a fixed random_state for repeatability.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_engineered,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {X_temp.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Univariate feature selection with SelectKBest / f_classif, fitted on the
# training partition only
k_best = 20  # Select top 20 features
selector = SelectKBest(score_func=f_classif, k=k_best)
X_selected = selector.fit_transform(X_temp, y_temp)

# Boolean mask over X_temp's columns marking the retained features
support_mask = selector.get_support()
selected_features = X_temp.columns[support_mask].tolist()
feature_scores = selector.scores_[support_mask]

print(f"Selected {len(selected_features)} features:")
feature_df = (
    pd.DataFrame({'Feature': selected_features, 'Score': feature_scores})
    .sort_values('Score', ascending=False)
)

print(feature_df)

# Horizontal bars; the y-axis is inverted so the highest score sits on top
bar_positions = range(len(feature_df))
plt.figure(figsize=(12, 8))
plt.barh(bar_positions, feature_df['Score'])
plt.yticks(bar_positions, feature_df['Feature'])
plt.xlabel('F-Score')
plt.title('Selected Features by F-Score')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 7. Data Scaling

In [None]:
# Prepare data with selected features
X_train_sel = X_temp[selected_features]
X_test_sel = X_test[selected_features]

print(f"Training set with selected features: {X_train_sel.shape}")
print(f"Test set with selected features: {X_test_sel.shape}")

# Scale the features. The scaler is fitted on the training partition only and
# then applied unchanged to the test partition.
scaler = RobustScaler()

# The scaler returns plain arrays, so wrap them back into DataFrames.
# Passing the source index keeps the row labels identical to y_temp / y_test;
# without it the frames would get a fresh 0..n-1 index and any label-aligned
# pandas operation between features and targets would silently mismatch rows.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_sel),
    columns=selected_features,
    index=X_train_sel.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_sel),
    columns=selected_features,
    index=X_test_sel.index,
)

print("\nData scaling completed.")
print(f"Training set mean: {X_train_scaled.mean().mean():.6f}")
print(f"Training set std: {X_train_scaled.std().mean():.6f}")

## 8. Class Imbalance Handling Approaches

In [None]:
print("Class distribution before balancing:")
print(y_temp.value_counts())
print(f"Imbalance ratio: 1:{y_temp.value_counts()[0] // y_temp.value_counts()[1]}")

# Approach 1: SMOTE (Synthetic Minority Oversampling)
# Only the training partition is resampled; the test set is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_temp)

print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())

# Visualize the effect: one bar chart per stage, drawn with identical settings
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

stage_panels = (
    (axes[0], y_temp.value_counts(), 'Class Distribution - Before SMOTE'),
    (axes[1], pd.Series(y_train_smote).value_counts(), 'Class Distribution - After SMOTE'),
)
for panel_ax, panel_counts, panel_title in stage_panels:
    panel_counts.plot(kind='bar', ax=panel_ax, color=['skyblue', 'lightcoral'])
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Class')
    panel_ax.set_ylabel('Count')
    panel_ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

## 9. Model Training and Evaluation

In [None]:
# Model grid: two estimator families x three ways of handling class imbalance.
# Each entry gets its own fresh estimator instance because evaluation fits
# the stored object in place.
def _new_logreg(**extra):
    """Logistic regression with the settings shared by every configuration."""
    return LogisticRegression(random_state=42, max_iter=1000, **extra)


def _new_forest(**extra):
    """Random forest with the settings shared by every configuration."""
    return RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, **extra)


models_config = {
    # Original models without class balancing
    'Logistic Regression (Original)': {'model': _new_logreg(), 'data_type': 'Original'},
    'Random Forest (Original)': {'model': _new_forest(), 'data_type': 'Original'},

    # Models with class weights (alternative to SMOTE); still trained on the
    # un-resampled data, hence data_type 'Original'
    'Logistic Regression (Balanced)': {
        'model': _new_logreg(class_weight='balanced'),
        'data_type': 'Original',
    },
    'Random Forest (Balanced)': {
        'model': _new_forest(class_weight='balanced'),
        'data_type': 'Original',
    },

    # Models with SMOTE: default estimators trained on the resampled data
    'Logistic Regression (SMOTE)': {'model': _new_logreg(), 'data_type': 'SMOTE'},
    'Random Forest (SMOTE)': {'model': _new_forest(), 'data_type': 'SMOTE'},
}

print(f"Model configurations to evaluate: {len(models_config)}")
for name in models_config:
    print(f"  • {name}")

In [None]:
# Evaluation function
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name="Model"):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters:
        model: estimator exposing ``fit``, ``predict`` and ``predict_proba``;
            it is fitted in place.
        X_train, y_train: training features and labels.
        X_test, y_test: evaluation features and labels.
        model_name: label stored under the ``'Model'`` key.

    Returns:
        dict with the label, threshold-based metrics (Accuracy, Precision,
        Recall, F1-Score, MCC, Specificity), score-based metrics (ROC-AUC,
        PR-AUC), the four confusion-matrix cells (TP, TN, FP, FN), the fitted
        estimator (``'Model_Object'``), the hard predictions
        (``'Predictions'``) and the second ``predict_proba`` column
        (``'Probabilities'``).
    """
    model.fit(X_train, y_train)

    # Hard labels for threshold-based metrics; the second predict_proba column
    # is used as the score for ROC-AUC / PR-AUC
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    results = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba),
        'PR-AUC': average_precision_score(y_test, y_pred_proba),
        'MCC': matthews_corrcoef(y_test, y_pred),
    }

    # ravel() on the 2x2 matrix yields the cells in the order tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    actual_negatives = tn + fp

    results.update({
        # Share of actual negatives predicted negative; 0 when there are none
        'Specificity': tn / actual_negatives if actual_negatives > 0 else 0,
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'Model_Object': model,
        'Predictions': y_pred,
        'Probabilities': y_pred_proba,
    })

    return results

# Confirmation message so the definition cell produces visible output when run
print("Evaluation function defined.")

In [None]:
# Fit and score every configuration; one result dict per configuration
all_results = []

print("Evaluating models...\n")

for model_name, config in models_config.items():
    print(f"Training and evaluating: {model_name}")

    model, data_type = config['model'], config['data_type']

    # 'SMOTE' configurations train on the resampled set; everything else
    # trains on the scaled, un-resampled training set
    uses_smote = data_type == 'SMOTE'
    X_train = X_train_smote if uses_smote else X_train_scaled
    y_train = y_train_smote if uses_smote else y_temp

    # Evaluation always uses the same scaled test set
    result = evaluate_model(model, X_train, y_train, X_test_scaled, y_test, model_name)
    result.update({
        'Data_Type': data_type,
        # Estimator family = label text before the " (approach)" suffix
        'Base_Model': model_name.split(' (')[0],
    })
    all_results.append(result)

    print(f"  F1-Score: {result['F1-Score']:.4f}, ROC-AUC: {result['ROC-AUC']:.4f}")
    print(f"  Precision: {result['Precision']:.4f}, Recall: {result['Recall']:.4f}\n")

print(f"Total evaluations completed: {len(all_results)}")

## 10. Results Analysis and Comparison

In [None]:
# Assemble the comparison table from the per-model result dicts
results_df = pd.DataFrame(all_results)
metrics_cols = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'PR-AUC', 'MCC', 'Specificity']
display_df = results_df[['Model', 'Base_Model', 'Data_Type'] + metrics_cols].copy()

# Round all metric columns to four decimals in one step
display_df[metrics_cols] = display_df[metrics_cols].round(4)

# Best F1 first
display_df_sorted = display_df.sort_values('F1-Score', ascending=False)

print("Model Performance Comparison (Sorted by F1-Score):")
print(display_df_sorted)

separator = "="*80
print("\n" + separator)
print("KEY INSIGHTS:")
print(separator)

# Per-family rankings: the three approaches of each estimator, best F1 first
lr_results = display_df[display_df['Base_Model'] == 'Logistic Regression'].sort_values('F1-Score', ascending=False)
rf_results = display_df[display_df['Base_Model'] == 'Random Forest'].sort_values('F1-Score', ascending=False)

family_tables = (
    ("\nLogistic Regression Performance:", lr_results),
    ("\nRandom Forest Performance:", rf_results),
)
for heading, ranked in family_tables:
    print(heading)
    for idx, row in ranked.iterrows():
        # Approach name = the text inside the parentheses of the model label
        approach = row['Model'].split('(')[1].replace(')', '')
        print(f"  {approach:10}: F1={row['F1-Score']:.4f}, Precision={row['Precision']:.4f}, Recall={row['Recall']:.4f}")

In [None]:
# Performance comparison visualization: one bar chart per metric, one bar per
# model configuration, coloured by imbalance-handling approach
fig, axes = plt.subplots(2, 2, figsize=(18, 14))

metrics_to_plot = ['F1-Score', 'ROC-AUC', 'Precision', 'Recall']

# Approach tag (as it appears in the model label) -> bar colour.
# Tags are tested in this order; the first one found in the label wins.
approach_colors = {
    'Original': 'lightblue',
    'Balanced': 'orange',
    'SMOTE': 'lightcoral',
}

# Names, colours and tick labels are the same for every metric, so they are
# computed once instead of once per subplot.
model_names = [result['Model'] for result in all_results]
# Fallback colour for labels without a known tag: without it the colour list
# would be shorter than the bar list and the colours would be misassigned.
colors = [
    next((shade for tag, shade in approach_colors.items() if tag in name), 'lightgray')
    for name in model_names
]
tick_labels = [name.replace(' (', '\n(') for name in model_names]
bar_positions = range(len(model_names))

# axes.flat walks the 2x2 grid row by row, matching the metric order above
for ax, metric in zip(axes.flat, metrics_to_plot):
    metric_values = [result[metric] for result in all_results]

    bars = ax.bar(bar_positions, metric_values, color=colors)
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=9)
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} Comparison Across Approaches')
    ax.grid(True, alpha=0.3)

    # Value labels sit slightly above each bar (1% of the tallest bar);
    # default=0 keeps this from failing when there are no results to plot
    label_offset = max(metric_values, default=0) * 0.01
    for bar, value in zip(bars, metric_values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                f'{value:.3f}', ha='center', va='bottom', fontsize=8)

# Shared legend for the whole figure, placed below the subplots
legend_elements = [
    plt.Rectangle((0,0),1,1, facecolor='lightblue', label='Original Data'),
    plt.Rectangle((0,0),1,1, facecolor='orange', label='Class Weights'),
    plt.Rectangle((0,0),1,1, facecolor='lightcoral', label='SMOTE')
]
fig.legend(handles=legend_elements, loc='center', bbox_to_anchor=(0.5, 0.02), ncol=3)

plt.tight_layout()
# Reserve room at the bottom for the figure-level legend
plt.subplots_adjust(bottom=0.1)
plt.show()

In [None]:
# Pick the configuration with the highest F1 (first one wins on ties)
best_result = max(all_results, key=lambda res: res['F1-Score'])

print(f"🏆 BEST PERFORMING MODEL: {best_result['Model']}")
print("\nPerformance Metrics:")
for metric in metrics_cols:
    value = best_result[metric]
    print(f"  {metric}: {value:.4f}")

# Confusion matrix of the winning configuration (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, best_result['Predictions'])
class_names = ['Normal', 'Fraud']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title(f'Confusion Matrix - {best_result["Model"]}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

print("\nConfusion Matrix Analysis:")
print(f"  True Negatives (Correctly identified normal): {best_result['TN']:,}")
print(f"  False Positives (False alarms): {best_result['FP']:,}")
print(f"  False Negatives (Missed fraud): {best_result['FN']:,}")
print(f"  True Positives (Correctly identified fraud): {best_result['TP']:,}")

# Detection rate = TP / (TP + FN); false alarm rate = FP / (FP + TN)
print("\nModel Performance:")
print(f"  Detection Rate: {best_result['TP']/(best_result['TP']+best_result['FN'])*100:.1f}%")
print(f"  False Alarm Rate: {best_result['FP']/(best_result['FP']+best_result['TN'])*100:.3f}%")

In [None]:
# ROC (left) and Precision-Recall (right) curves for every configuration
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Line colour and dash pattern per imbalance-handling approach
approach_styles = {
    'Original': {'color': 'blue', 'linestyle': '-'},
    'Balanced': {'color': 'orange', 'linestyle': '--'},
    'SMOTE': {'color': 'red', 'linestyle': ':'}
}


def _line_style(model_label):
    """Choose the style from the approach tag in the label; default 'Original'."""
    for tag in ('Balanced', 'SMOTE'):
        if tag in model_label:
            return approach_styles[tag]
    return approach_styles['Original']


roc_ax, pr_ax = axes

# Draw each configuration's two curves in one pass
for result in all_results:
    style = _line_style(result['Model'])

    fpr, tpr, _ = roc_curve(y_test, result['Probabilities'])
    auc_score = result['ROC-AUC']
    roc_ax.plot(fpr, tpr, label=f"{result['Model']} (AUC: {auc_score:.3f})", **style)

    precision, recall, _ = precision_recall_curve(y_test, result['Probabilities'])
    pr_auc = result['PR-AUC']
    pr_ax.plot(recall, precision, label=f"{result['Model']} (AUC: {pr_auc:.3f})", **style)

# ROC reference: the diagonal of a random classifier
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.6, label='Random Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves - All Models')
roc_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
roc_ax.grid(True, alpha=0.3)

# PR reference: a horizontal line at the positive-class share of the test set
baseline = (y_test == 1).sum() / len(y_test)
pr_ax.axhline(y=baseline, color='k', linestyle='--', alpha=0.6,
              label=f'Random Classifier ({baseline:.3f})')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curves - All Models')
pr_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
pr_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 11. Feature Importance Analysis

In [None]:
# Feature importance of the best-scoring Random Forest configuration
rf_results = [r for r in all_results if 'Random Forest' in r['Model']]

if not rf_results:
    print("No Random Forest models found in results.")
else:
    # Best Random Forest by F1; reuse its already fitted estimator
    best_rf = max(rf_results, key=lambda res: res['F1-Score'])
    rf_model = best_rf['Model_Object']

    # Importances are paired with selected_features by position
    feature_importance = (
        pd.DataFrame({
            'Feature': selected_features,
            'Importance': rf_model.feature_importances_,
        })
        .sort_values('Importance', ascending=False)
    )

    # Horizontal bars for the 15 strongest features, strongest on top
    top_features = feature_importance.head(15)
    bar_positions = range(len(top_features))
    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, top_features['Importance'])
    plt.yticks(bar_positions, top_features['Feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 15 Feature Importance - {best_rf["Model"]}')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    print(f"Top 10 Most Important Features ({best_rf['Model']}):")
    print(feature_importance.head(10))

## 12. Threshold Optimization (Optional Enhancement)

In [None]:
# Sweep the decision threshold of the best model and keep the F1-maximizing one
print("Optimizing decision threshold for the best model...")

best_model = best_result['Model_Object']
best_probabilities = best_result['Probabilities']

# NOTE(review): every candidate threshold is scored against y_test, the same
# data used for the final evaluation, so the "optimized" F1 reported below is
# likely optimistic. Consider selecting the threshold on a validation split.

# Candidate cut-offs from 0.10 upward in steps of 0.05 (0.9 itself excluded)
thresholds = np.arange(0.1, 0.9, 0.05)
threshold_results = []

for threshold in thresholds:
    # Positive prediction whenever the stored probability reaches the cut-off
    y_pred_thresh = (best_probabilities >= threshold).astype(int)

    threshold_results.append({
        'Threshold': threshold,
        'Precision': precision_score(y_test, y_pred_thresh),
        'Recall': recall_score(y_test, y_pred_thresh),
        'F1-Score': f1_score(y_test, y_pred_thresh),
    })

# One row per threshold; the winner is the first row with the highest F1
threshold_df = pd.DataFrame(threshold_results)
best_row = threshold_df.loc[threshold_df['F1-Score'].idxmax()]
best_threshold = best_row['Threshold']
best_f1_optimized = best_row['F1-Score']

print(f"Best threshold: {best_threshold:.3f}")
print(f"Optimized F1-Score: {best_f1_optimized:.4f}")
print(f"Original F1-Score: {best_result['F1-Score']:.4f}")
print(f"Improvement: {best_f1_optimized - best_result['F1-Score']:+.4f}")

# Metric curves over the threshold, with the chosen and the default cut-off marked
plt.figure(figsize=(12, 6))
for column, line_format in (('Precision', 'b-'), ('Recall', 'r-'), ('F1-Score', 'g-')):
    plt.plot(threshold_df['Threshold'], threshold_df[column], line_format, label=column, linewidth=2)
plt.axvline(x=best_threshold, color='black', linestyle='--', alpha=0.7, label=f'Best Threshold ({best_threshold:.3f})')
plt.axvline(x=0.5, color='gray', linestyle=':', alpha=0.5, label='Default Threshold (0.5)')
plt.xlabel('Decision Threshold')
plt.ylabel('Score')
plt.title(f'Threshold Optimization - {best_result["Model"]}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 13. Summary and Recommendations

In [None]:
# Text report that pulls together the numbers computed in the cells above
def _result_named(label):
    """Return the first stored result whose 'Model' equals ``label``."""
    return [r for r in all_results if r['Model'] == label][0]


def _approach_tag(label):
    """Map a model label to 'Original', 'Balanced' or (otherwise) 'SMOTE'."""
    for tag in ('Original', 'Balanced'):
        if tag in label:
            return tag
    return 'SMOTE'


print("CREDIT CARD FRAUD DETECTION - IMPROVED ANALYSIS SUMMARY")
print("=" * 70)

print("\n📊 DATASET OVERVIEW:")
print(f"  • Total transactions: {len(df):,}")
print(f"  • Normal transactions: {(df['Class'] == 0).sum():,} ({(df['Class'] == 0).mean()*100:.3f}%)")
print(f"  • Fraud transactions: {(df['Class'] == 1).sum():,} ({(df['Class'] == 1).mean()*100:.3f}%)")
print(f"  • Original features: {len(X.columns)}")
print(f"  • Engineered features: {len(X_engineered.columns)}")
print(f"  • Selected features: {len(selected_features)}")

print("\n🔍 FEATURE SELECTION:")
print("  • Method used: SelectKBest with f_classif")
print(f"  • Top 5 features: {feature_df['Feature'].head(5).tolist()}")

# Fixed text: the approaches and model families covered by the grid
for line in (
    "\n⚖️ IMBALANCE HANDLING APPROACHES TESTED:",
    "  • Original data (no balancing)",
    "  • Class weights (balanced)",
    "  • SMOTE (synthetic oversampling)",
    "\n🤖 MODELS EVALUATED:",
    "  • Logistic Regression (3 approaches)",
    "  • Random Forest (3 approaches)",
):
    print(line)
print(f"  • Total combinations: {len(all_results)}")

print("\n🏆 BEST MODEL PERFORMANCE:")
print(f"  • Best Model: {best_result['Model']}")
print(f"  • F1-Score: {best_result['F1-Score']:.4f}")
print(f"  • ROC-AUC: {best_result['ROC-AUC']:.4f}")
print(f"  • Precision: {best_result['Precision']:.4f}")
print(f"  • Recall: {best_result['Recall']:.4f}")

# Only mentioned when the threshold-optimization cell has been run
if 'best_f1_optimized' in locals():
    print(f"  • Optimized F1-Score: {best_f1_optimized:.4f} (with threshold {best_threshold:.3f})")

print("\n📈 KEY FINDINGS:")

# The three Logistic Regression configurations, looked up by exact label
lr_original = _result_named('Logistic Regression (Original)')
lr_balanced = _result_named('Logistic Regression (Balanced)')
lr_smote = _result_named('Logistic Regression (SMOTE)')

print("\nLogistic Regression F1-Score Comparison:")
print(f"  • Original data: {lr_original['F1-Score']:.4f}")
print(f"  • Class weights: {lr_balanced['F1-Score']:.4f}")
print(f"  • SMOTE: {lr_smote['F1-Score']:.4f}")

if lr_smote['F1-Score'] < lr_original['F1-Score']:
    print("  ⚠️ SMOTE hurt Logistic Regression performance!")
    print(f"     F1-Score dropped by {lr_original['F1-Score'] - lr_smote['F1-Score']:.3f}")
    print("     This suggests original data distribution is important for this model.")

print("\n💡 RECOMMENDATIONS:")

# Best approach per estimator family, judged by F1
best_lr_approach = max([lr_original, lr_balanced, lr_smote], key=lambda res: res['F1-Score'])
best_rf_approach = max([r for r in all_results if 'Random Forest' in r['Model']], key=lambda res: res['F1-Score'])

# Advice text keyed by the winning approach of each family
lr_advice = {
    'Original': (
        "    • Use original data without rebalancing",
        "    • Consider threshold optimization instead",
    ),
    'Balanced': (
        "    • Use class_weight='balanced' parameter",
        "    • This handles imbalance without synthetic data",
    ),
    'SMOTE': (
        "    • SMOTE works best for this model",
    ),
}
rf_advice = {
    'Original': (
        "    • Use original data - Random Forest handles imbalance well",
    ),
    'Balanced': (
        "    • Use class_weight='balanced' parameter",
    ),
    'SMOTE': (
        "    • SMOTE provides the best results",
    ),
}

print("\n  For Logistic Regression:")
for line in lr_advice[_approach_tag(best_lr_approach['Model'])]:
    print(line)

print("\n  For Random Forest:")
for line in rf_advice[_approach_tag(best_rf_approach['Model'])]:
    print(line)

print("\n  General Recommendations:")
print(f"    • Deploy: {best_result['Model']}")
print(f"    • Focus on top {min(10, len(selected_features))} most important features")
# The tuned threshold is only suggested when it differs from 0.5 by more than 0.05
if 'best_threshold' in locals() and abs(best_threshold - 0.5) > 0.05:
    print(f"    • Use optimized threshold: {best_threshold:.3f} instead of default 0.5")
print("    • Implement real-time monitoring for new fraud patterns")
print("    • Regular model retraining recommended")
print("    • Consider ensemble methods for further improvement")

# Monetary estimate: test-set TP / FN counts multiplied by the mean fraud
# amount taken over the whole dataset
print("\n📈 BUSINESS IMPACT:")
fraud_amount_avg = df[df['Class'] == 1]['Amount'].mean()
potential_savings = best_result['TP'] * fraud_amount_avg
missed_losses = best_result['FN'] * fraud_amount_avg
print(f"  • Average fraud amount: ${fraud_amount_avg:.2f}")
print(f"  • Potential fraud prevented: ${potential_savings:,.2f}")
print(f"  • Estimated missed losses: ${missed_losses:,.2f}")
print(f"  • Model effectiveness: {potential_savings/(potential_savings+missed_losses)*100:.1f}%")

if best_result['FN'] > 0:
    print(f"\n⚠️ IMPORTANT: {best_result['FN']} fraud cases were missed")
    print("    Consider adjusting threshold to increase recall if business cost of missing fraud is high")

## 14. Why SMOTE May Hurt Performance

In [None]:
# Analysis of why SMOTE might hurt performance.
# The first part is general background text; the final part reports the effect
# actually measured in this run and only claims a drop when there was one.
print("ANALYSIS: Why SMOTE May Reduce Performance in Fraud Detection")
print("=" * 65)

print("\n🔬 POSSIBLE EXPLANATIONS:")
print("\n1. REAL-WORLD DISTRIBUTION MATTERS:")
print("   • Credit card fraud is naturally rare (~0.17% of transactions)")
print("   • The model needs to learn this real-world distribution")
print("   • Artificial balancing can distort decision boundaries")

print("\n2. SYNTHETIC DATA QUALITY:")
print("   • SMOTE creates synthetic minority class samples")
print("   • These may not capture real fraud patterns accurately")
print("   • Could introduce noise that confuses the classifier")

print("\n3. MODEL-SPECIFIC EFFECTS:")
print("   • Logistic Regression is particularly sensitive to class distribution")
print("   • It learns probability estimates based on training data proportions")
print("   • Artificial balancing can lead to miscalibrated probabilities")

print("\n4. EVALUATION METRICS:")
print("   • F1-score balances precision and recall")
print("   • If SMOTE increases false positives significantly, F1 drops")
print("   • Original imbalanced data may naturally optimize for precision")

print("\n💡 BETTER ALTERNATIVES TO SMOTE:")
print("   • Use class_weight='balanced' in sklearn models")
print("   • Optimize decision threshold instead of rebalancing data")
print("   • Use cost-sensitive learning approaches")
print("   • Focus on improving feature engineering")
print("   • Consider ensemble methods that handle imbalance naturally")

# Measured effect for Logistic Regression.
# Positive value = SMOTE scored a lower F1 than the un-resampled data.
lr_performance_drop = lr_original['F1-Score'] - lr_smote['F1-Score']
print("\n📉 PERFORMANCE IMPACT:")
if lr_performance_drop > 0:
    # A positive drop implies the original F1 is above zero, so the relative
    # decrease below cannot divide by zero.
    print(f"   • Logistic Regression F1-Score drop with SMOTE: {lr_performance_drop:.3f}")
    print(f"   • That's a {lr_performance_drop/lr_original['F1-Score']*100:.1f}% relative decrease!")
else:
    # SMOTE matched or beat the original data: do not report a "drop"
    print(f"   • SMOTE did not lower the Logistic Regression F1-Score in this run (change: {-lr_performance_drop:+.3f})")

if lr_balanced['F1-Score'] > lr_smote['F1-Score']:
    improvement = lr_balanced['F1-Score'] - lr_smote['F1-Score']
    print(f"   • Class weights perform {improvement:.3f} points better than SMOTE")
    print("   • Recommendation: Use class_weight='balanced' instead of SMOTE")

In [None]:
# Write the result tables to CSV files in the working directory
results_summary = display_df_sorted.copy()
results_summary.to_csv('fraud_detection_comprehensive_results.csv', index=False)
print("Results saved to 'fraud_detection_comprehensive_results.csv'")

# Save feature selection results
feature_df.to_csv('selected_features.csv', index=False)
print("Selected features saved to 'selected_features.csv'")

# Tables that exist only if their cell was run: (variable, file, message).
# At notebook top level locals() is the notebook namespace.
optional_exports = (
    ('feature_importance', 'feature_importance.csv',
     "Feature importance saved to 'feature_importance.csv'"),
    ('threshold_df', 'threshold_optimization.csv',
     "Threshold optimization results saved to 'threshold_optimization.csv'"),
)
for variable_name, csv_name, confirmation in optional_exports:
    if variable_name in locals():
        locals()[variable_name].to_csv(csv_name, index=False)
        print(confirmation)

print("\n✅ Comprehensive analysis completed successfully!")
print("\n📋 KEY TAKEAWAYS:")
print(f"  • Best approach: {best_result['Model']}")
print(f"  • Best F1-Score: {best_result['F1-Score']:.4f}")
# The tuned score is listed only when it beats the default-threshold F1
if 'best_f1_optimized' in locals() and best_f1_optimized > best_result['F1-Score']:
    print(f"  • With threshold optimization: {best_f1_optimized:.4f}")
print(f"  • {best_result['TP']} out of {best_result['TP']+best_result['FN']} fraud cases detected")
print(f"  • {best_result['FP']} false alarms out of {best_result['TN']+best_result['FP']} normal transactions")
print("  • SMOTE may not always improve performance - test alternatives!")
print("  • Model ready for deployment with proper threshold and monitoring")