# Fraud Detection - Exploratory Data Analysis (EDA)

This notebook contains exploratory analysis only. It is NOT required for training or inference.

**Purpose:** Understand data distributions, correlations, class balance, and feature relationships.

**Author:** Cristhian Acosta  
**Date:** February 2026

## 1. Setup and Data Loading

In [None]:
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide configuration: quiet output, consistent plot styling, seeded RNG.
# NOTE(review): the blanket 'ignore' filter also hides pandas/numpy runtime
# warnings (e.g. NaN or divide-by-zero notices); narrow it if a result looks off.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
np.random.seed(42)  # seeds NumPy's global (legacy) random generator

# Add the parent of the working directory to the import path so `src` resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
PROJECT_ROOT = str(Path.cwd().parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.data import prepare_dataset

print("Setup complete")

In [None]:
# Build features (X) and target (y) by running the raw CSV through the
# project's `prepare_dataset` pipeline.
DATA_PATH = "../fraud-detection-mlops/data/raw/onlinefraud.csv"

X, y = prepare_dataset(DATA_PATH)

# Report dimensions first, then how the target is split between classes.
print("\nDataset Shape:")
print(f"  Features: {X.shape}")
print(f"  Target: {y.shape}")

print("\nClass Distribution:")
print(y.value_counts())
# The mean of the target is printed as the fraud rate.
overall_fraud_rate = y.mean()
print(f"  Fraud rate: {overall_fraud_rate:.4%}")

## 2. Class Balance Visualization

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count each class explicitly by label (0 = legitimate, 1 = fraud).
# value_counts() orders by frequency, so pairing it with hard-coded labels
# would mislabel the bars if fraud ever outnumbered legitimate rows, and a
# class with zero rows would make bar()/pie() fail on a length mismatch.
class_labels = ['Legitimate', 'Fraud']
class_colors = ['green', 'red']
counts = pd.Series(
    [int((y == 0).sum()), int((y == 1).sum())],
    index=[0, 1],
)

# Count plot
axes[0].bar(class_labels, counts.values, color=class_colors, alpha=0.7)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_title('Class Distribution (Absolute)', fontsize=14, fontweight='bold')
axes[0].grid(True, alpha=0.3)

# Percentage plot (four decimals because the minority share is tiny)
axes[1].pie(counts.values, labels=class_labels, autopct='%1.4f%%',
            colors=class_colors, startangle=90)
axes[1].set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\nDataset is highly imbalanced: {y.mean():.4%} fraud")

## 3. Feature Statistics

In [None]:
# Descriptive statistics, transposed so each feature is one row.
print("Feature Statistics:\n")
summary_table = X.describe().T
print(summary_table)

# Per-column null counts; list only the columns that actually contain nulls.
print("\nMissing Values:")
missing = X.isnull().sum()
columns_with_nulls = missing[missing > 0]
if columns_with_nulls.empty:
    print("  No missing values found")
else:
    print(columns_with_nulls)

# How many columns there are of each dtype.
print("\nFeature Types:")
dtype_counts = X.dtypes.value_counts()
print(dtype_counts)

## 4. Feature Distributions

In [None]:
# Select key numerical features for visualization
key_features = ['amount', 'log1p_amount', 'balance_change_orig',
                'balance_change_dest', 'hour_of_day', 'day_of_month']

existing_features = [f for f in key_features if f in X.columns]
missing_features = [f for f in key_features if f not in X.columns]

# Previously nothing was drawn unless all six features were present; now the
# available ones are plotted and the absent ones are reported.
if missing_features:
    print(f"Skipping features not present in X: {missing_features}")

if existing_features:
    n_cols = 3
    n_rows = (len(existing_features) + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False keeps `axes` 2-D even for a single row, so flatten() is safe.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, feature in zip(axes, existing_features):
        # Drop NaNs: hist() cannot derive a bin range from non-finite data.
        ax.hist(X[feature].dropna(), bins=50, alpha=0.7, edgecolor='black')
        ax.set_xlabel(feature, fontsize=11)
        ax.set_ylabel('Frequency', fontsize=11)
        ax.set_title(f'{feature} Distribution', fontsize=12, fontweight='bold')
        ax.grid(True, alpha=0.3)

    # Hide grid cells that have no feature assigned.
    for unused_ax in axes[len(existing_features):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("None of the key features are present in X; nothing to plot")

## 5. Feature Distributions by Class

In [None]:
# Sample data for visualization (to avoid memory issues)
sample_size = min(10000, len(X))
# Use a dedicated, explicitly seeded generator so the sample is the same every
# time this cell runs, regardless of how much of NumPy's global random state
# earlier cells (or re-runs) have already consumed.
sample_rng = np.random.RandomState(42)
sample_idx = sample_rng.choice(len(X), sample_size, replace=False)
X_sample = X.iloc[sample_idx]
y_sample = y.iloc[sample_idx]

# Combine for easier plotting
df_sample = X_sample.copy()
df_sample['isFraud'] = y_sample.values

# Select features for comparison
compare_features = ['log1p_amount', 'balance_change_orig', 'emptied_account', 'is_night']
existing_compare = [f for f in compare_features if f in df_sample.columns]
missing_compare = [f for f in compare_features if f not in df_sample.columns]

# Plot whatever is available instead of requiring all four features.
if missing_compare:
    print(f"Skipping features not present in X: {missing_compare}")

if existing_compare:
    class_groups = [
        (df_sample[df_sample['isFraud'] == 0], 'Legitimate', 'green'),
        (df_sample[df_sample['isFraud'] == 1], 'Fraud', 'red'),
    ]

    n_cols = 2
    n_rows = (len(existing_compare) + n_cols - 1) // n_cols  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, feature in zip(axes, existing_compare):
        for class_rows, class_label, class_color in class_groups:
            values = class_rows[feature].dropna()
            if values.empty:
                # The sample may contain no rows of this class; nothing to draw.
                continue
            # density=True normalises each class separately. With raw counts the
            # rare fraud class is visually swamped by the legitimate histogram.
            values.hist(ax=ax, bins=30, alpha=0.5, label=class_label,
                        color=class_color, density=True)
        ax.set_xlabel(feature, fontsize=11)
        ax.set_ylabel('Density', fontsize=11)
        ax.set_title(f'{feature} by Class', fontsize=12, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Hide grid cells that have no feature assigned.
    for unused_ax in axes[len(existing_compare):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("None of the comparison features are present in X; nothing to plot")

## 6. Correlation Matrix

In [None]:
# Compute correlation on numeric features only (uses the row sample X_sample).
numeric_features = X_sample.select_dtypes(include=[np.number]).columns.tolist()

# Compute the full matrix once; the heatmap below uses a sub-matrix of it.
# Each entry depends only on its own pair of columns, so slicing gives the
# same values as recomputing on the column subset.
corr_full = X_sample[numeric_features].corr()

MAX_HEATMAP_FEATURES = 20   # keep the heatmap readable
TOP_CORRELATES_PER_KEY = 8  # neighbours kept per key variable (includes itself)
HIGH_CORR_THRESHOLD = 0.9

# Limit to reasonable number for visualization
if len(numeric_features) > MAX_HEATMAP_FEATURES:
    # Keep the key variables plus the features most correlated with them.
    key_vars = ['amount', 'balance_change_orig', 'balance_change_dest',
                'inconsistent_orig', 'inconsistent_dest']
    existing_key = [v for v in key_vars if v in numeric_features]

    if existing_key:
        # A dict is used as an insertion-ordered set. Slicing a plain set made
        # the chosen 20 features (and their order) vary between runs.
        selected = dict.fromkeys(existing_key)
        for key_var in existing_key:
            top_corr = corr_full[key_var].abs().nlargest(TOP_CORRELATES_PER_KEY).index
            selected.update(dict.fromkeys(top_corr))
        numeric_features = list(selected)[:MAX_HEATMAP_FEATURES]

corr_matrix = corr_full.loc[numeric_features, numeric_features]

plt.figure(figsize=(14, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Identify highly correlated pairs
print(f"\nHighly Correlated Feature Pairs (|r| > {HIGH_CORR_THRESHOLD}):\n")
corr_columns = corr_matrix.columns
corr_values = corr_matrix.to_numpy()
# Strict upper triangle (k=1): each unordered pair once, no self-pairs.
upper_rows, upper_cols = np.triu_indices(len(corr_columns), k=1)
high_corr = [
    (corr_columns[i], corr_columns[j], corr_values[i, j])
    for i, j in zip(upper_rows, upper_cols)
    # NaN correlations (e.g. constant columns) compare False and are skipped.
    if abs(corr_values[i, j]) > HIGH_CORR_THRESHOLD
]

if high_corr:
    for feat1, feat2, corr_val in high_corr:
        print(f"  {feat1:30s} <-> {feat2:30s}  r={corr_val:6.3f}")
else:
    print(f"  No pairs with |r| > {HIGH_CORR_THRESHOLD} found")

## 7. Transaction Type Analysis

In [None]:
# Transaction volume and fraud rate per transaction type; needs a 'type' column.
if 'type' not in X.columns:
    print("⚠️  'type' column not found")
else:
    df_type = X[['type']].copy()
    df_type['isFraud'] = y.values

    # Both aggregates are computed up front, then drawn side by side.
    type_counts = df_type['type'].value_counts()
    fraud_rate_by_type = df_type.groupby('type')['isFraud'].mean()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    count_ax, rate_ax = axes

    # Left panel: number of transactions per type
    count_ax.bar(type_counts.index, type_counts.values, alpha=0.7)
    count_ax.set_ylabel('Count', fontsize=12)
    count_ax.set_title('Transaction Type Distribution', fontsize=14, fontweight='bold')

    # Right panel: fraud rate per type, expressed in percent
    rate_ax.bar(fraud_rate_by_type.index, fraud_rate_by_type.values * 100,
                color='red', alpha=0.7)
    rate_ax.set_ylabel('Fraud Rate (%)', fontsize=12)
    rate_ax.set_title('Fraud Rate by Transaction Type', fontsize=14, fontweight='bold')

    # Formatting shared by both panels
    for panel in (count_ax, rate_ax):
        panel.set_xlabel('Transaction Type', fontsize=12)
        panel.tick_params(axis='x', rotation=45)
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\nFraud Rate by Type:")
    print(fraud_rate_by_type.to_string())

## 8. Temporal Patterns

In [None]:
# Fraud rate (in percent) grouped by each temporal feature.
# Each entry: column name -> (axis label, extra line styling).
temporal_plot_specs = {
    'hour_of_day': ('Hour of Day', {}),
    'day_of_month': ('Day of Month', {'color': 'orange'}),
}
temporal_cols = [c for c in temporal_plot_specs if c in X.columns]
missing_temporal = [c for c in temporal_plot_specs if c not in X.columns]

# Previously this cell did nothing, silently, unless both columns existed.
# Now each available column is plotted and missing ones are reported.
if missing_temporal:
    print(f"⚠️  temporal columns not found: {missing_temporal}")

if temporal_cols:
    df_temporal = X[temporal_cols].copy()
    df_temporal['isFraud'] = y.values

    # squeeze=False keeps `axes` 2-D even when only one column is available.
    fig, axes = plt.subplots(1, len(temporal_cols), figsize=(16, 5), squeeze=False)

    for ax, col in zip(axes.flatten(), temporal_cols):
        axis_label, line_style = temporal_plot_specs[col]
        fraud_rate_pct = df_temporal.groupby(col)['isFraud'].mean() * 100
        ax.plot(fraud_rate_pct.index, fraud_rate_pct.values,
                marker='o', linewidth=2, **line_style)
        ax.set_xlabel(axis_label, fontsize=12)
        ax.set_ylabel('Fraud Rate (%)', fontsize=12)
        ax.set_title(f'Fraud Rate by {axis_label}', fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 9. Key Feature Insights

In [None]:
# For each indicator column present in X, print the fraud rate observed at
# every distinct value of that indicator.
indicators = ['emptied_account', 'inconsistent_orig', 'inconsistent_dest',
              'dest_account_new', 'amount_is_round']
existing_indicators = [ind for ind in indicators if ind in X.columns]

if existing_indicators:
    df_indicators = X[existing_indicators].copy()
    df_indicators['isFraud'] = y.values

    print("Fraud Rate by Key Indicators:\n")
    for indicator in existing_indicators:
        fraud_rate = df_indicators.groupby(indicator)['isFraud'].mean()
        # Assemble the header and one line per value, then emit them together.
        report_lines = [f"\n{indicator}:"]
        report_lines.extend(f"  {val}: {rate:.4%}" for val, rate in fraud_rate.items())
        print("\n".join(report_lines))

## 10. Correlation with Target

In [None]:
# Compute correlation with target (on the row sample X_sample / y_sample)
df_with_target = X_sample.copy()
df_with_target['isFraud'] = y_sample.values

numeric_cols = df_with_target.select_dtypes(include=[np.number]).columns
target_corr = (
    df_with_target[numeric_cols]
    .corrwith(df_with_target['isFraud'])
    # The target correlates with itself at 1.0 and would otherwise head the
    # "top features" list. errors='ignore' covers a non-numeric target column.
    .drop('isFraud', errors='ignore')
    # Constant columns give NaN, which sorts last and would otherwise fill the
    # "bottom" list and the plot with empty entries.
    .dropna()
    .sort_values(ascending=False)
)

if target_corr.empty:
    # E.g. the sample contains a single class, so every correlation is NaN.
    print("\nNo finite feature/target correlations could be computed for this sample")
else:
    print("\nTop 15 Features Correlated with Fraud:\n")
    print(target_corr.head(15).to_string())

    print("\nBottom 10 Features (Negative Correlation):\n")
    print(target_corr.tail(10).to_string())

    # Visualize: the 20 largest signed correlations, largest at the top of the chart
    plt.figure(figsize=(10, 8))
    target_corr.sort_values().tail(20).plot(kind='barh', color='steelblue')
    plt.xlabel('Correlation with Fraud', fontsize=12)
    plt.title('Top 20 Features Correlated with Fraud', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

## 11. Summary and Recommendations

### Key Findings:

1. **Class Imbalance:** Dataset is highly imbalanced (~0.13% fraud) → Use SMOTE or class weights
2. **Feature Engineering:** Multiple derived features capture balance inconsistencies
3. **Temporal Patterns:** Fraud rates may vary by hour of day and day of month (see Section 8)
4. **Transaction Types:** Fraud rates differ across transaction types (see Section 7)

### Next Steps:

- Train model with SMOTE or class weights
- Monitor AUC-PR (more relevant than AUC-ROC for imbalanced data)
- Consider removing highly correlated features if needed
- Choose the decision threshold from a dedicated threshold analysis (not part of this notebook) rather than defaulting to 0.5