# Data Exploration for Financial Fraud Detection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from src.utils.preprocessing import DataPreprocessor

# Load the dataset: try the joblib snapshot first and only fall back to the
# project's DataLoader when that file does not exist.
raw_snapshot_path = 'data/processed/creditcard_raw.joblib'
try:
    df = joblib.load(raw_snapshot_path)
except FileNotFoundError:
    # Imported lazily so the loader module is only needed on the fallback path
    from src.utils.data_loader import DataLoader
    loader = DataLoader()
    df = loader.load_creditcard_data()

# Quick overview: dimensions, mean of the 'Class' column as a percentage,
# and the first rows (kept as the last expression so the notebook renders it)
print(f"Dataset shape: {df.shape}")
print(f"Fraud rate: {df['Class'].mean()*100:.2f}%")
df.head()

## Class Distribution

In [None]:
# Bar chart of the number of rows per value of the 'Class' column
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Class', ax=ax)
ax.set_title('Class Distribution (0: Legitimate, 1: Fraud)')
plt.show()

## Transaction Amount Analysis

In [None]:
# Two side-by-side panels: overall amount histogram (left) and
# per-class amount box plot (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

# Left panel: histogram of all transaction amounts in 50 bins
sns.histplot(df['Amount'], bins=50, ax=hist_ax)
hist_ax.set(
    title='Distribution of Transaction Amounts',
    xlabel='Amount',
    ylabel='Count',
)

# Right panel: amount spread for each class on a logarithmic y-axis
# NOTE(review): a log axis cannot display non-positive values — confirm how
# rows with Amount <= 0 (if any exist) are expected to appear here.
sns.boxplot(data=df, x='Class', y='Amount', ax=box_ax)
box_ax.set(title='Transaction Amounts by Class', yscale='log')

plt.tight_layout()
plt.show()

## Time Analysis

In [None]:
# Derive an hour-of-day bucket (0-23) from the 'Time' column and store it as a
# new 'Time_hour' column on df. % and // share precedence and evaluate left to
# right; the parentheses just make the intended order explicit: wrap to one
# day first, then count whole hours.
# NOTE(review): assumes 'Time' is an elapsed-seconds offset — confirm against
# the data loader.
df['Time_hour'] = (df['Time'] % (24 * 3600)) // 3600

plt.figure(figsize=(12, 5))
# discrete=True draws one unit-wide bar centred on each integer hour.
# The previous bins=24 split the 0..23 range into bins 23/24 wide, so the bars
# drifted away from the hour ticks they represent.
sns.histplot(data=df, x='Time_hour', hue='Class', discrete=True, multiple='stack')
plt.title('Transaction Frequency by Hour of Day')
plt.xlabel('Hour of Day')
plt.ylabel('Count')
plt.show()

## Correlation Analysis

In [None]:
# Correlation of every numeric column with the 'Class' label, sorted from most
# positive to most negative. Restricting to numeric columns keeps a stray
# non-numeric column from raising, and corrwith avoids computing the full
# column-by-column matrix when only one column of it is needed.
correlations = (
    df.select_dtypes(include='number')
    .corrwith(df['Class'])
    .sort_values(ascending=False)
)

# Rank features by the *magnitude* of their correlation so that strongly
# negative relationships are not left out of the "top 10". The label itself is
# removed by name rather than by assuming it sits in the first position.
feature_correlations = correlations.drop('Class')
strongest_features = (
    feature_correlations.abs().sort_values(ascending=False).index[:10]
)
top_correlations = feature_correlations.loc[strongest_features]

# Plot the signed coefficients of the ten strongest features
plt.figure(figsize=(10, 8))
sns.barplot(x=top_correlations.values, y=top_correlations.index)
plt.title('Top 10 Features Correlated with Fraud')
plt.xlabel('Correlation Coefficient')
plt.show()

## Feature Distributions

In [None]:
# Per-class density curves for columns V1..V10 (the PCA components),
# laid out on a 5x2 grid with one column per panel
pca_columns = [f'V{k}' for k in range(1, 11)]

fig, axes = plt.subplots(5, 2, figsize=(15, 20))
axes = axes.flatten()

for panel, col in zip(axes, pca_columns):
    # common_norm=False normalises each class's curve on its own
    sns.kdeplot(data=df, x=col, hue='Class', common_norm=False, ax=panel)
    panel.set_title(f'Distribution of {col}')

plt.tight_layout()
plt.show()