# Credit Card Fraud Detection

## Objective: Identify Fraudulent Transactions

### Challenge: Imbalanced Dataset (99.8% Normal, 0.2% Fraud)

### Models:
- Logistic Regression
- Random Forest
- XGBoost

### Solutions for Imbalanced Data:
- SMOTE (Synthetic Minority Over-sampling Technique)
- Random Undersampling
- Class Weights

## 1. Import Libraries

In [None]:
# Analysis/plotting stack and stdlib helpers used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# NOTE: this suppresses every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')

# Import custom modules
import sys
# Must run before the `src` imports below: adds the parent directory to the
# module search path (presumably the project root that holds `src/` — confirm).
sys.path.append('..')

from src.data_loader import load_data, preprocess_data, split_data, get_data_summary
from src.imbalance_handlers import apply_smote, apply_undersampling, get_class_weights, apply_smote_tomek
from src.models import train_logistic_regression, train_random_forest, train_xgboost, evaluate_model, compare_models
from src.visualization import (plot_class_distribution, plot_confusion_matrix, plot_roc_curves,
                               plot_precision_recall_curves, plot_feature_importance, 
                               plot_metrics_comparison, plot_sampling_comparison)

# Reached only if every import above succeeded.
print("Libraries imported successfully!")

## 2. Load and Explore Data

In [None]:
# Load the dataset
# Download from: https://www.kaggle.com/mlg-ulb/creditcardfraud
# Place creditcard.csv in the data folder

DATA_PATH = '../data/creditcard.csv'

try:
    df = load_data(DATA_PATH)
except FileNotFoundError:
    # Show the download hint, then fail here. Swallowing the error would
    # leave `df` undefined and every later cell would die with a NameError
    # that hides the real cause.
    print("Please download the dataset from Kaggle and place it in the data folder.")
    print("URL: https://www.kaggle.com/mlg-ulb/creditcardfraud")
    raise

In [None]:
# Quick look at the frame: dimensions, per-column dtypes, then the first rows.
print("Dataset Shape:", df.shape)
# One call with a newline separator emits the same text as separate prints.
print("\nColumn Types:", df.dtypes, "\nFirst 5 rows:", sep="\n")
# Last expression of the cell, so the notebook renders it as a table.
df.head()

In [None]:
# Collect headline statistics once and print them as a framed report.
summary = get_data_summary(df)

summary_rule = "=" * 50
print("\n" + summary_rule)
print("DATASET SUMMARY")
print(summary_rule)

print(f"Total Transactions: {summary['total_transactions']:,}")
print(
    f"Normal Transactions: {summary['normal_transactions']:,} "
    f"({summary['normal_percentage']:.2f}%)"
)
print(
    f"Fraud Transactions: {summary['fraud_transactions']:,} "
    f"({summary['fraud_percentage']:.2f}%)"
)
print(f"Imbalance Ratio: {summary['imbalance_ratio']:.0f}:1")
print(f"Missing Values: {summary['missing_values']}")

In [None]:
# Statistical description
# Per-column summary statistics; rendered by the notebook because it is the
# last expression in the cell.
df.describe()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Plot class distribution
# Passes the target column and a title to the project's plotting helper and
# keeps the returned object in `fig`.
fig = plot_class_distribution(df['Class'], "Transaction Class Distribution")
plt.show()

In [None]:
# Side-by-side Amount histograms, one panel per class (0 = normal, 1 = fraud).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

amount_panels = (
    (0, 'Normal', '#2ecc71'),
    (1, 'Fraud', '#e74c3c'),
)

for panel_ax, (class_value, class_name, bar_color) in zip(axes, amount_panels):
    class_amounts = df.loc[df['Class'] == class_value, 'Amount']
    class_amounts.hist(bins=50, ax=panel_ax, color=bar_color, edgecolor='black')
    panel_ax.set_title(f'Amount Distribution - {class_name} Transactions')
    panel_ax.set_xlabel('Amount')
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Time distribution, normalized per class.
fig, ax = plt.subplots(figsize=(12, 5))

# Use one set of bin edges for both classes so the bars cover identical
# intervals and can be compared directly.
time_bins = np.histogram_bin_edges(df['Time'], bins=50)

# density=True: fraud is only ~0.2% of the rows, so raw counts would shrink
# the fraud bars to nothing next to the normal ones. Normalizing each class
# to unit area makes the two shapes comparable.
ax.hist(df.loc[df['Class'] == 0, 'Time'], bins=time_bins, density=True,
        alpha=0.7, label='Normal', color='#2ecc71')
ax.hist(df.loc[df['Class'] == 1, 'Time'], bins=time_bins, density=True,
        alpha=0.7, label='Fraud', color='#e74c3c')
ax.set_title('Transaction Time Distribution')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Density')
ax.legend()
plt.show()

In [None]:
# Correlation heatmap across every column of df (lower triangle only).
fig, ax = plt.subplots(figsize=(16, 12))
corr_matrix = df.corr()

# True on and above the diagonal: those cells are hidden so each pair of
# columns is drawn once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

heatmap_options = {
    'mask': mask,
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 0.5,
    'ax': ax,
    'cbar_kws': {'shrink': 0.8},
}
sns.heatmap(corr_matrix, **heatmap_options)

ax.set_title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of each feature with the target, strongest (by magnitude) first.
# corrwith computes only the feature-vs-Class correlations instead of
# rebuilding the full pairwise matrix just to read one column of it.
corr_with_class = (
    df.drop(columns='Class')
    .corrwith(df['Class'])
    .sort_values(key=abs, ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 8))
# Red bars for negative correlations, green for zero or positive ones.
colors = ['#e74c3c' if x < 0 else '#2ecc71' for x in corr_with_class.values]
corr_with_class.plot(kind='barh', ax=ax, color=colors, edgecolor='black')
ax.set_title('Feature Correlation with Fraud (Class)')
ax.set_xlabel('Correlation Coefficient')
# Thin reference line at zero separating negative from positive bars.
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
# Run the project's preprocessing with scaling requested for Amount and Time.
# NOTE(review): this happens before the train/test split; if preprocess_data
# fits its scaler on the whole frame, test-set statistics leak into training
# — confirm against src.data_loader.
scaling_flags = {'scale_amount': True, 'scale_time': True}
df_processed = preprocess_data(df, **scaling_flags)
print("Processed features:", list(df_processed.columns))

In [None]:
# Split data
# 20% of the rows are requested for the test split, with a fixed seed so the
# split is reproducible. NOTE(review): whether the split is stratified on
# 'Class' depends on split_data — confirm.
X_train, X_test, y_train, y_test = split_data(df_processed, target_col='Class', test_size=0.2, random_state=42)

## 5. Handling Imbalanced Data

### 5.1 SMOTE (Synthetic Minority Over-sampling Technique)

In [None]:
# Apply SMOTE
# Only the training split is passed in; X_test / y_test are not resampled.
X_train_smote, y_train_smote = apply_smote(X_train, y_train, random_state=42)

### 5.2 Random Undersampling

In [None]:
# Apply Undersampling
# Only the training split is passed in; X_test / y_test are not resampled.
X_train_under, y_train_under = apply_undersampling(X_train, y_train, random_state=42)

### 5.3 Class Weights

In [None]:
# Calculate class weights
# NOTE(review): `class_weights` is not referenced by any other cell visible
# in this notebook — the weighted models pass class_weight='balanced'
# directly. Presumably kept for inspection; confirm.
class_weights = get_class_weights(y_train, weight_type='balanced')

In [None]:
# Gather each (features, labels) training pair under a display label and hand
# the mapping to the project's comparison plot.
sampling_results = {}
sampling_results['Original'] = (X_train, y_train)
sampling_results['SMOTE'] = (X_train_smote, y_train_smote)
sampling_results['Undersampling'] = (X_train_under, y_train_under)

fig = plot_sampling_comparison(sampling_results)
plt.show()

## 6. Model Training

### 6.1 Baseline Models (No Resampling)

In [None]:
# Fit the three reference models on the untouched training split
# (no resampling; no class-weight arguments are passed).
print("Training Baseline Models (Original Data)...\n")

original_split = (X_train, y_train)
lr_baseline = train_logistic_regression(*original_split)
rf_baseline = train_random_forest(*original_split)
# scale_pos_weight=1 -> no weight adjustment for the positive class.
xgb_baseline = train_xgboost(*original_split, scale_pos_weight=1)

print("Baseline models trained!")

In [None]:
# Score every baseline model on the held-out test split; results are keyed by
# a short label, and the long name is passed to evaluate_model.
baseline_models = (
    ('LR', 'Logistic Regression', lr_baseline),
    ('RF', 'Random Forest', rf_baseline),
    ('XGB', 'XGBoost', xgb_baseline),
)
baseline_results = {
    f'{short_name} (Baseline)': evaluate_model(
        fitted, X_test, y_test, f'{long_name} (Baseline)'
    )
    for short_name, long_name, fitted in baseline_models
}

### 6.2 Models with SMOTE

In [None]:
# Fit the same three model families on the SMOTE-resampled training data.
print("Training Models with SMOTE...\n")

smote_split = (X_train_smote, y_train_smote)
lr_smote = train_logistic_regression(*smote_split)
rf_smote = train_random_forest(*smote_split)
xgb_smote = train_xgboost(*smote_split, scale_pos_weight=1)

print("SMOTE models trained!")

In [None]:
# Score the SMOTE-trained models on the original (non-resampled) test split.
smote_models = (
    ('LR', 'Logistic Regression', lr_smote),
    ('RF', 'Random Forest', rf_smote),
    ('XGB', 'XGBoost', xgb_smote),
)
smote_results = {
    f'{short_name} (SMOTE)': evaluate_model(
        fitted, X_test, y_test, f'{long_name} (SMOTE)'
    )
    for short_name, long_name, fitted in smote_models
}

### 6.3 Models with Undersampling

In [None]:
# Fit the same three model families on the undersampled training data.
print("Training Models with Undersampling...\n")

under_split = (X_train_under, y_train_under)
lr_under = train_logistic_regression(*under_split)
rf_under = train_random_forest(*under_split)
xgb_under = train_xgboost(*under_split, scale_pos_weight=1)

print("Undersampling models trained!")

In [None]:
# Score the undersampling-trained models on the original test split.
# Result keys use the short "(Under)" suffix; the long name passed to
# evaluate_model uses "(Undersampling)".
under_models = (
    ('LR', 'Logistic Regression', lr_under),
    ('RF', 'Random Forest', rf_under),
    ('XGB', 'XGBoost', xgb_under),
)
under_results = {
    f'{short_name} (Under)': evaluate_model(
        fitted, X_test, y_test, f'{long_name} (Undersampling)'
    )
    for short_name, long_name, fitted in under_models
}

### 6.4 Models with Class Weights

In [None]:
# Fit on the original training split, compensating for imbalance through
# class weighting instead of resampling.
print("Training Models with Class Weights...\n")

balanced_weighting = {'class_weight': 'balanced'}
lr_weighted = train_logistic_regression(X_train, y_train, **balanced_weighting)
rf_weighted = train_random_forest(X_train, y_train, **balanced_weighting)
# No scale_pos_weight passed; per the original note the helper derives it
# automatically — confirm in src.models.
xgb_weighted = train_xgboost(X_train, y_train)

print("Weighted models trained!")

In [None]:
# Score the class-weighted models on the held-out test split.
weighted_models = (
    ('LR', 'Logistic Regression', lr_weighted),
    ('RF', 'Random Forest', rf_weighted),
    ('XGB', 'XGBoost', xgb_weighted),
)
weighted_results = {
    f'{short_name} (Weighted)': evaluate_model(
        fitted, X_test, y_test, f'{long_name} (Weighted)'
    )
    for short_name, long_name, fitted in weighted_models
}

## 7. Model Comparison

In [None]:
# Merge the four result groups into one mapping (insertion order: baseline,
# SMOTE, undersampling, weighted).
all_results = {}
for result_group in (baseline_results, smote_results, under_results, weighted_results):
    all_results.update(result_group)

# Table column header -> key in each model's metrics dict.
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
    'ROC-AUC': 'roc_auc',
    'Avg Precision': 'avg_precision',
}

# One row per model: its label followed by the selected metrics.
comparison_data = [
    {'Model': name, **{column: metrics[key] for column, key in metric_columns.items()}}
    for name, metrics in all_results.items()
]

# Highest F1 first; later cells read row 0 as the best model.
comparison_df = pd.DataFrame(comparison_data).sort_values('F1-Score', ascending=False)
print("\nModel Comparison (Sorted by F1-Score):")
comparison_df

In [None]:
# Metrics comparison across approaches using the XGBoost model from each one.
# The entries are fixed to XGBoost, not selected by score, so this compares
# imbalance-handling strategies on a single model family.
xgb_by_approach = (
    ('XGB (Baseline)', baseline_results),
    ('XGB (SMOTE)', smote_results),
    ('XGB (Under)', under_results),
    ('XGB (Weighted)', weighted_results),
)
best_models = {label: results[label] for label, results in xgb_by_approach}

fig = plot_metrics_comparison(best_models)
plt.show()

In [None]:
# ROC Curves comparison
# Uses the same four entries collected in `best_models`, together with the
# true test labels.
fig = plot_roc_curves(best_models, y_test)
plt.show()

In [None]:
# Precision-Recall Curves comparison
# Uses the same four entries collected in `best_models`, together with the
# true test labels.
fig = plot_precision_recall_curves(best_models, y_test)
plt.show()

## 8. Best Model Analysis

In [None]:
# comparison_df is sorted by F1-Score (descending), so its first row is the
# best model by F1.
top_row = comparison_df.iloc[0]
best_model_name = top_row['Model']

print(f"Best Model: {best_model_name}")
for metric_label in ('F1-Score', 'Recall', 'ROC-AUC'):
    print(f"{metric_label}: {top_row[metric_label]:.4f}")

In [None]:
# Confusion matrix for the best-F1 model, built from its stored predictions.
best_results = all_results[best_model_name]
confusion_title = f"Confusion Matrix - {best_model_name}"
fig = plot_confusion_matrix(y_test, best_results['y_pred'], confusion_title)
plt.show()

In [None]:
# Top-20 feature importances of the class-weighted XGBoost model. This is
# always xgb_weighted, whichever model ranked best by F1.
feature_names = X_train.columns.tolist()
importance_options = {
    'top_n': 20,
    'title': "Feature Importance - XGBoost (Weighted)",
}
fig = plot_feature_importance(xgb_weighted, feature_names, **importance_options)
plt.show()

## 9. Summary and Conclusions

In [None]:
# Final text report. Figures come from `summary`, `comparison_df` and
# `best_model_name` computed in earlier cells.
report_rule = "=" * 60

# comparison_df is sorted by F1-Score descending, so row 0 is the best-F1 model.
best_f1_row = comparison_df.iloc[0]
# The highest recall may belong to a different model than the best-F1 one,
# so look it up explicitly instead of reusing row 0.
best_recall_row = comparison_df.loc[comparison_df['Recall'].idxmax()]

print(report_rule)
print("CREDIT CARD FRAUD DETECTION - SUMMARY")
print(report_rule)
print("\n1. DATASET CHARACTERISTICS:")
print(f"   - Total Transactions: {summary['total_transactions']:,}")
print(f"   - Fraud Rate: {summary['fraud_percentage']:.3f}%")
print(f"   - Imbalance Ratio: {summary['imbalance_ratio']:.0f}:1")

print("\n2. IMBALANCE HANDLING TECHNIQUES TESTED:")
print("   - SMOTE (Synthetic Minority Over-sampling)")
print("   - Random Undersampling")
print("   - Class Weights")

print("\n3. MODELS COMPARED:")
print("   - Logistic Regression")
print("   - Random Forest")
print("   - XGBoost")

print("\n4. KEY FINDINGS:")
print(f"   - Best Model: {best_model_name}")
print(f"   - Best F1-Score: {best_f1_row['F1-Score']:.4f}")
print(f"   - Best Recall: {best_recall_row['Recall']:.4f} ({best_recall_row['Model']})")

print("\n5. RECOMMENDATIONS:")
print("   - For fraud detection, prioritize RECALL to catch more frauds")
print("   - Use class weights or SMOTE for better minority class detection")
# Name the winner from this run's results rather than a fixed model, so the
# recommendation cannot contradict the "Best Model" line above.
print(f"   - {best_model_name} achieved the best F1-Score in this run")
print("   - Consider business cost of false positives vs false negatives")
print(report_rule)

In [None]:
# Display final comparison table
print("\nFINAL MODEL COMPARISON:")
# Colour-grade the three headline metric columns (red -> yellow -> green);
# as the last expression in the cell, the Styler is rendered by the notebook.
comparison_df.style.background_gradient(cmap='RdYlGn', subset=['F1-Score', 'Recall', 'ROC-AUC'])