# Fraud Detection - Exploratory Data Analysis

This notebook explores the credit card fraud dataset and prepares features for modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Settings
# Allow up to 50 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
# NOTE(review): the 'seaborn-v0_8-*' style names appear to exist only in recent
# matplotlib releases (3.6+); older versions may only know 'seaborn-whitegrid' — confirm.
plt.style.use('seaborn-v0_8-whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load Data

In [None]:
# Load dataset
data_path = Path('../data/raw/creditcard.csv')

# Fail fast when the CSV is missing: every later cell needs `df`, so merely
# printing a hint would only postpone the failure to a confusing NameError.
if not data_path.exists():
    raise FileNotFoundError(
        f'Dataset not found at {data_path}. Please download from Kaggle.\n'
        'Run: python scripts/download_data.py'
    )

df = pd.read_csv(data_path)
print(f'Dataset shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}')

In [None]:
# Basic info
# Prints a summary of the frame: column names, non-null counts, dtypes and memory use.
df.info()

In [None]:
# First rows
# Bare last expression, so the notebook renders the first rows (five by default) as a table.
df.head()

## 2. Class Distribution

In [None]:
# Class distribution
# Count the rows per class label, then report the mean of the 0/1 label,
# which is the share of rows marked as fraud.
class_labels = df['Class']
rows_per_class = class_labels.value_counts()
fraud_share = class_labels.mean()

print('Class Distribution:')
print(rows_per_class)
print(f'\nFraud ratio: {fraud_share:.4%}')

In [None]:
# Visualize class imbalance
# value_counts() orders its rows by frequency, not by label. Reindex to a fixed
# [0, 1] order so the colors and tick labels below always match the right bar
# (a class that is absent from the data is shown with a count of 0).
counts_by_class = df['Class'].value_counts().reindex([0, 1], fill_value=0)

fig, ax = plt.subplots(figsize=(8, 5))
counts_by_class.plot(kind='bar', ax=ax, color=['steelblue', 'crimson'])
ax.set_title('Class Distribution (Highly Imbalanced)', fontsize=14)
ax.set_xlabel('Class (0=Legitimate, 1=Fraud)')
ax.set_ylabel('Count')
ax.set_xticklabels(['Legitimate', 'Fraud'], rotation=0)
plt.tight_layout()
plt.show()

## 3. Feature Analysis

In [None]:
# Statistical summary
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Amount distribution by class
# Fraud is a tiny share of the rows, so raw counts on per-class bins make the
# fraud bars invisible and not comparable with the legitimate ones. Both
# classes therefore share one set of bin edges and are normalized to densities.
class_styles = {0: ('Legitimate', 'steelblue'), 1: ('Fraud', 'crimson')}
panels = [
    ('Amount', df['Amount']),
    ('Log(Amount)', np.log1p(df['Amount'])),  # log1p(x) = log(1 + x), finite at x = 0
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (axis_label, values) in zip(axes, panels):
    # Bin edges come from all rows so both classes are binned identically.
    bin_edges = np.histogram_bin_edges(values.dropna(), bins=50)
    for class_value, (class_name, color) in class_styles.items():
        ax.hist(values[df['Class'] == class_value], bins=bin_edges, density=True,
                alpha=0.7, label=class_name, color=color)
    ax.set_title(f'{axis_label} Distribution by Class')
    ax.set_xlabel(axis_label)
    ax.set_ylabel('Density')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Time distribution
# 'Time' is divided by 3600 and wrapped modulo 24 — presumably it holds seconds
# since the first transaction, making this a fractional hour of day (TODO confirm units).
# The column is derived only from 'Time', so re-running this cell is safe.
df['Hour'] = (df['Time'] / 3600) % 24

fig, ax = plt.subplots(figsize=(12, 5))

# Pin the 24 bins to whole hours over [0, 24] for both classes (without an
# explicit range each histogram picks its own edges), and normalize to
# densities so the small fraud class is visible next to the legitimate one.
hist_options = dict(bins=24, range=(0, 24), density=True, alpha=0.7, ax=ax)
df.loc[df['Class'] == 0, 'Hour'].hist(label='Legitimate', color='steelblue', **hist_options)
df.loc[df['Class'] == 1, 'Hour'].hist(label='Fraud', color='crimson', **hist_options)

ax.set_title('Transaction Time Distribution (Hour of Day)')
ax.set_xlabel('Hour')
ax.set_ylabel('Density')
ax.legend()
plt.tight_layout()
plt.show()

## 4. Correlation Analysis

In [None]:
# Correlation with target
# Take the 'Class' column of the pairwise correlation matrix, remove the
# label's own entry and order the features from most negative to most positive.
class_column = df.corr()['Class']
correlations = class_column.drop('Class').sort_values()

# Negative correlations are drawn in crimson, everything else in steelblue.
bar_colors = ['crimson' if value < 0 else 'steelblue' for value in correlations]

fig, ax = plt.subplots(figsize=(10, 8))
correlations.plot.barh(ax=ax, color=bar_colors)
ax.set_title('Feature Correlation with Fraud (Class)')
ax.set_xlabel('Correlation')
plt.tight_layout()
plt.show()

In [None]:
# Top correlated features
# `correlations` is sorted in ascending order, so its last five entries are the
# largest values and its first five entries are the smallest.
strongest_positive = correlations.iloc[-5:]
strongest_negative = correlations.iloc[:5]

print('Most positively correlated with Fraud:')
print(strongest_positive)
print('\nMost negatively correlated with Fraud:')
print(strongest_negative)

## 5. Key Insights

1. **Highly Imbalanced Dataset**: Only ~0.17% of transactions are fraudulent.
2. **PCA Features**: V1–V28 are PCA-transformed components; the original features are hidden for privacy.
3. **Important Features**: V14, V17, V12, and V10 show strong correlation with fraud.
4. **Amount**: Fraudulent and legitimate transactions have different amount distributions.
5. **Time**: Some time-of-day patterns are visible when comparing fraudulent and legitimate transactions.

In [None]:
# Final marker cell: its output shows that the notebook ran through to the end.
print('EDA Complete!')