# Exploratory Data Analysis (EDA)
# Fraud Detection Dataset

This notebook explores the synthetic transaction dataset to understand:
- Data distributions
- Class imbalance
- Feature correlations
- Fraud patterns

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide settings: silence every warning category and give all
# figures the same grid style and default size.
warnings.simplefilter('ignore')
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

print('Libraries imported successfully!')

## 1. Load and Inspect Data

In [None]:
# Read the raw transactions CSV into `df`, the frame every later cell works on.
raw_csv_path = 'data/raw/transactions.csv'
df = pd.read_csv(raw_csv_path)

# Quick orientation: size, column names, then a rich preview of the first rows.
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head(10)

In [None]:
# Summary statistics from DataFrame.describe() with its default settings;
# leaving the result as the cell's last expression renders it as a table.
summary_stats = df.describe()
summary_stats

In [None]:
# Count null entries per column
null_counts = df.isna().sum()
print("Missing values:")
print(null_counts)

# Show the dtype pandas inferred for each column
column_dtypes = df.dtypes
print("\nData types:")
print(column_dtypes)

## 2. Target Variable Analysis

In [None]:
# Class distribution: absolute counts and percentage share of each is_fraud label
fraud_counts = df['is_fraud'].value_counts()
fraud_pct = df['is_fraud'].value_counts(normalize=True) * 100

# Look each class up with .get() so that a label missing from the data (for
# example a filtered sample containing no fraud rows) is reported as 0
# instead of raising KeyError.
n_legit, n_fraud = fraud_counts.get(0, 0), fraud_counts.get(1, 0)
pct_legit, pct_fraud = fraud_pct.get(0, 0.0), fraud_pct.get(1, 0.0)

print("Fraud Distribution:")
print(f"Legitimate: {n_legit:,} ({pct_legit:.2f}%)")
print(f"Fraud: {n_fraud:,} ({pct_fraud:.2f}%)")

# The ratio is fraud:legitimate, floored to a whole number. It is undefined
# when there are no fraud rows, so report that rather than dividing by zero.
if n_fraud:
    print(f"\nImbalance Ratio: 1:{n_legit // n_fraud}")
else:
    print("\nImbalance Ratio: undefined (no fraud rows)")

In [None]:
# Visualize class distribution as a bar chart (counts) and a pie chart (shares)

# value_counts() orders its rows by frequency, not by label. Sort by label so
# the hard-coded ['Legitimate', 'Fraud'] names and colors below always attach
# to classes 0 and 1 respectively, whichever class happens to be more common.
class_counts = fraud_counts.sort_index()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot
class_counts.plot(kind='bar', ax=ax1, color=['steelblue', 'coral'])
ax1.set_title('Transaction Class Distribution', fontsize=14, fontweight='bold')
ax1.set_xlabel('Class (0=Legitimate, 1=Fraud)', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.set_xticklabels(['Legitimate', 'Fraud'], rotation=0)

# Pie chart
ax2.pie(class_counts, labels=['Legitimate', 'Fraud'], autopct='%1.2f%%',
        colors=['steelblue', 'coral'], startangle=90)
ax2.set_title('Class Distribution Proportion', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 3. Numerical Features Analysis

In [None]:
# Distribution of key numeric features, overlaid by fraud status.
# One panel per feature: (column, number of bins, x-axis label, panel title)
hist_specs = [
    ('amount_usd', 50, 'Amount (USD)', 'Transaction Amount Distribution by Class'),
    ('customer_age_days', 50, 'Customer Age (days)', 'Customer Age Distribution by Class'),
    ('customer_txn_30d', 30, 'Transactions (30 days)', 'Transaction Count Distribution by Class'),
    ('hour_of_day', 24, 'Hour of Day', 'Transaction Time Distribution by Class'),
]
# One overlaid histogram per class: (is_fraud value, legend label, color)
class_styles = [(0, 'Legitimate', 'steelblue'), (1, 'Fraud', 'coral')]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# axes.flat walks the grid row by row, matching the order of hist_specs.
for ax, (col, n_bins, x_label, title) in zip(axes.flat, hist_specs):
    # Derive the bin edges once from the whole column and reuse them for both
    # classes. Letting each hist() call pick its own edges from its own subset
    # gives the two overlaid histograms different bin widths and positions,
    # so their bars cannot be compared directly.
    bin_edges = np.histogram_bin_edges(df[col].dropna(), bins=n_bins)

    for class_value, class_label, color in class_styles:
        ax.hist(df.loc[df['is_fraud'] == class_value, col], bins=bin_edges,
                alpha=0.6, label=class_label, color=color)

    # Bars are raw counts, so the rarer class will look small next to the
    # more common one.
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Box plots of key numeric features, split by fraud status.
# One panel per feature: (column, panel title, y-axis label)
box_specs = [
    ('amount_usd', 'Amount by Fraud Status', 'Amount (USD)'),
    ('customer_age_days', 'Customer Age by Fraud Status', 'Age (days)'),
    ('customer_txn_30d', 'Txn Count by Fraud Status', 'Transactions (30d)'),
]

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for ax, (col, title, y_label) in zip(axes, box_specs):
    df.boxplot(column=col, by='is_fraud', ax=ax)
    # Set our own title and axis labels after pandas has drawn the panel.
    ax.set_title(title)
    ax.set_xlabel('Is Fraud')
    ax.set_ylabel(y_label)

# Blank out the figure-level title before laying out the figure.
plt.suptitle('')
plt.tight_layout()
plt.show()

## 4. Categorical Features Analysis

In [None]:
# Fraud rate by categorical features.
# The per-group mean of is_fraud is plotted as that group's fraud rate.
# The first two series are ordered from highest to lowest rate; the last two
# keep groupby's key order, which the fixed tick labels below rely on.
category_fraud = df.groupby('merchant_category')['is_fraud'].mean().sort_values(ascending=False)
channel_fraud = df.groupby('channel')['is_fraud'].mean().sort_values(ascending=False)
card_fraud = df.groupby('card_present')['is_fraud'].mean()
country_fraud = df.groupby('country_mismatch')['is_fraud'].mean()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_category, ax_channel), (ax_card, ax_country) = axes

# Merchant category
category_fraud.plot.bar(ax=ax_category, color='coral')
ax_category.set(title='Fraud Rate by Merchant Category', ylabel='Fraud Rate')
ax_category.tick_params(axis='x', rotation=45)

# Channel
channel_fraud.plot.bar(ax=ax_channel, color='steelblue')
ax_channel.set(title='Fraud Rate by Channel', ylabel='Fraud Rate')
ax_channel.tick_params(axis='x', rotation=45)

# Card present
card_fraud.plot.bar(ax=ax_card, color='green')
ax_card.set(title='Fraud Rate by Card Present', ylabel='Fraud Rate')
ax_card.set_xticklabels(['Not Present', 'Present'], rotation=0)

# Country mismatch
country_fraud.plot.bar(ax=ax_country, color='purple')
ax_country.set(title='Fraud Rate by Country Mismatch', ylabel='Fraud Rate')
ax_country.set_xticklabels(['No Mismatch', 'Mismatch'], rotation=0)

plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Columns that go into the correlation matrix (the target is included so its
# row/column shows each feature's correlation with fraud)
num_cols = [
    'amount_usd', 'customer_age_days', 'customer_txn_30d',
    'avg_amount_30d', 'std_amount_30d', 'card_present',
    'country_mismatch', 'hour_of_day', 'is_weekend', 'is_fraud',
]

# Pairwise correlations using DataFrame.corr() defaults
corr_matrix = df.loc[:, num_cols].corr()

# Annotated heatmap with the colormap centered on zero
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
heatmap_fig.tight_layout()
plt.show()

In [None]:
# Correlation of every feature with the target, highest (signed) value first.
# The target's own entry is dropped before sorting.
fraud_column = corr_matrix['is_fraud']
target_corr = fraud_column.drop(labels=['is_fraud']).sort_values(ascending=False)

# Horizontal bars with a dashed reference line at zero correlation
target_fig, target_ax = plt.subplots(figsize=(10, 6))
target_corr.plot(kind='barh', color='steelblue', ax=target_ax)
target_ax.set_title('Feature Correlation with Fraud', fontsize=14, fontweight='bold')
target_ax.set_xlabel('Correlation Coefficient')
target_ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
target_fig.tight_layout()
plt.show()

print("\nTop correlated features with fraud:")
print(target_corr)

## 6. Key Insights

### Fraud Patterns Identified:

1. **Amount**: Fraudulent transactions tend to have higher amounts
2. **Country Mismatch**: Strong indicator of fraud risk
3. **Card Not Present**: Higher fraud rate in CNP transactions
4. **New Accounts**: Younger accounts show higher fraud rates
5. **Time of Day**: Late-night/early-morning transactions are more suspicious
6. **Merchant Category**: Electronics and travel show elevated fraud rates

### Data Quality:
- No missing values
- Highly imbalanced dataset (~1.5% fraud)
- Reasonable distributions across all features
- Suitable for ML modeling with imbalance handling

## 7. Next Steps

1. Feature engineering (deviation from averages, velocity features)
2. Model training with imbalance handling
3. SHAP analysis for explainability
4. Threshold optimization for business metrics