# Exploratory Data Analysis - Credit Card Fraud Detection

**Dataset:** creditcard.csv

**Objective:** Comprehensive exploratory analysis to understand:
- Dataset structure and characteristics
- Class distribution and imbalance
- Feature distributions and correlations
- Temporal patterns
- Statistical insights for fraud detection

## 1. Setup and Data Loading

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Configure display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

print("‚úÖ Libraries imported successfully")

In [None]:
# Load dataset (the path is relative to the notebook's working directory)
df = pd.read_csv('../data/raw/creditcard.csv')
# The multiplication sign was stored as a mis-decoded byte sequence; restored.
print(f"Dataset loaded: {df.shape[0]:,} rows × {df.shape[1]} columns")
# Last expression of the cell: the notebook renders the first five rows.
df.head()

## 2. Dataset Overview

In [None]:
# Banner followed by pandas' own structural summary of the frame
print("=" * 80, "DATASET INFORMATION", "=" * 80, sep="\n")
df.info()

In [None]:
# Banner (preceded by a blank line), then descriptive statistics.
# describe() is the cell's last expression so the notebook renders its table.
print("\n" + "=" * 80, "STATISTICAL SUMMARY", "=" * 80, sep="\n")
df.describe()

In [None]:
# Missing values check: per-column null count and its share of all rows
missing = df.isnull().sum()
missing_pct = 100 * missing / len(df)
missing_table = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})
print("\n" + "="*80)
print("MISSING VALUES")
print("="*80)
if missing.sum() == 0:
    # Nothing to tabulate: skip the table so an empty DataFrame is not printed.
    print("✅ No missing values detected")
else:
    # Only list the columns that actually contain nulls.
    print(missing_table[missing_table['Missing Count'] > 0])

In [None]:
# Check for duplicates: count rows that repeat an earlier row in every column
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates:,}")
if duplicates == 0:
    print("✅ No duplicates detected")

## 3. Class Distribution Analysis

In [None]:
# Class distribution
# value_counts() orders by frequency; sort_index() pins the order to the class
# labels (0, then 1) so code that pairs these counts with positional labels or
# colours does not depend on which class happens to be the majority.
class_counts = df['Class'].value_counts().sort_index()
class_pct = 100 * class_counts / len(df)
# Legitimate rows per fraudulent row (label-based lookups on the 0/1 index).
imbalance_ratio = class_counts[0] / class_counts[1]

print("="*80)
print("CLASS DISTRIBUTION")
print("="*80)
print(f"\nLegitimate (Class 0): {class_counts[0]:,} ({class_pct[0]:.2f}%)")
print(f"Fraudulent (Class 1): {class_counts[1]:,} ({class_pct[1]:.2f}%)")
print(f"\n⚠️  Imbalance Ratio: {imbalance_ratio:.2f}:1")
print("\nThis is a highly imbalanced dataset!")

In [None]:
# Visualize class distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Resolve names and colours by class label rather than by position, so the
# charts stay correct regardless of the order class_counts is stored in.
class_names = {0: 'Legitimate', 1: 'Fraud'}
class_colors = {0: '#2ecc71', 1: '#e74c3c'}
plot_counts = class_counts.sort_index()
plot_labels = [class_names[label] for label in plot_counts.index]
colors = [class_colors[label] for label in plot_counts.index]

# Bar plot
plot_counts.plot(kind='bar', ax=axes[0], color=colors, alpha=0.8)
axes[0].set_title('Class Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Class (0=Legitimate, 1=Fraud)', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_xticklabels(plot_labels, rotation=0)
# Place each annotation slightly above its bar; the gap scales with the
# tallest bar so it also looks right on a sub-sampled dataset.
label_offset = 0.02 * plot_counts.max()
for position, (label, count) in enumerate(plot_counts.items()):
    # `position` is the bar's x coordinate; `label` is the class used to look
    # up the matching percentage.
    axes[0].text(position, count + label_offset,
                 f'{count:,}\n({class_pct[label]:.2f}%)',
                 ha='center', va='bottom', fontsize=11, fontweight='bold')

# Pie chart (the fraud slice is pulled out for visibility)
explode = [0.1 if label == 1 else 0 for label in plot_counts.index]
axes[1].pie(plot_counts, labels=plot_labels, autopct='%1.2f%%',
            startangle=90, colors=colors, explode=explode, shadow=True)
axes[1].set_title('Class Distribution (Proportion)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 4. Temporal Analysis

In [None]:
# Time analysis - add an hours column derived from 'Time' (divided by 3600)
df['Time_hours'] = df['Time'] / 3600

first_hour = df['Time_hours'].min()
last_hour = df['Time_hours'].max()

print("=" * 80, "TEMPORAL STATISTICS", "=" * 80, sep="\n")
print(f"\nTime range: {first_hour:.2f} to {last_hour:.2f} hours")
print(f"Duration: {last_hour:.2f} hours (~{last_hour / 24:.1f} days)")

# Same descriptive statistics for each class, legitimate first
for class_name, class_label in (("Legitimate", 0), ("Fraudulent", 1)):
    print(f"\nTime statistics for {class_name} transactions:")
    print(df.loc[df['Class'] == class_label, 'Time_hours'].describe())

In [None]:
# Transaction distribution over time: two stacked histograms
fig, axes = plt.subplots(2, 1, figsize=(16, 10))
timeline_ax, by_class_ax = axes

# Top panel: every transaction on a single timeline
timeline_ax.hist(df['Time_hours'], bins=100, color='#3498db',
                 alpha=0.7, edgecolor='black')
timeline_ax.set_title('Transaction Distribution Over Time (All Transactions)',
                      fontsize=14, fontweight='bold')

# Bottom panel: the same timeline split into legitimate (0) and fraud (1)
hours_by_class = [df.loc[df['Class'] == label, 'Time_hours'] for label in (0, 1)]
by_class_ax.hist(hours_by_class, bins=100, color=['#2ecc71', '#e74c3c'],
                 label=['Legitimate', 'Fraud'], alpha=0.7, edgecolor='black')
by_class_ax.set_title('Transaction Distribution by Class Over Time',
                      fontsize=14, fontweight='bold')
by_class_ax.legend(fontsize=11)

# Axis labels and grid are identical on both panels
for panel in axes:
    panel.set_xlabel('Time (hours)', fontsize=12)
    panel.set_ylabel('Number of Transactions', fontsize=12)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Fraud rate over time bins
# 48 equal-width bins over the observed Time_hours range.
# NOTE(review): "~1 hour each" only holds if the data spans about 48 hours.
time_bins = pd.cut(df['Time_hours'], bins=48)
# Grouping by a categorical key: pass observed=False explicitly so empty bins
# are kept (as NaN rates) and every bin keeps its position on the x axis.
# Relying on the implicit default is deprecated in recent pandas, and the
# warning is hidden by the notebook's global warning filter.
fraud_rate_time = (
    df.groupby(time_bins, observed=False)['Class']
    .agg(['sum', 'count', 'mean'])
)
# With a 0/1 target: sum = fraud count, count = all rows, mean = fraud rate.
fraud_rate_time.columns = ['Fraud_Count', 'Total_Transactions', 'Fraud_Rate']

fig, ax = plt.subplots(figsize=(16, 6))
x_pos = range(len(fraud_rate_time))
ax.plot(x_pos, fraud_rate_time['Fraud_Rate'] * 100, marker='o', 
        linewidth=2, markersize=4, color='#e74c3c')
ax.fill_between(x_pos, fraud_rate_time['Fraud_Rate'] * 100, alpha=0.3, color='#e74c3c')
ax.set_title('Fraud Rate Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Time Bins (~1 hour each)', fontsize=12)
ax.set_ylabel('Fraud Rate (%)', fontsize=12)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Amount Analysis

In [None]:
# Amount statistics by class
# These two series are reused by later cells (plots and the final summary).
legit_amounts = df[df['Class']==0]['Amount']
fraud_amounts = df[df['Class']==1]['Amount']

print("="*80)
print("AMOUNT STATISTICS BY CLASS")
print("="*80)
print("\nLegitimate Transactions (Class 0):")
print(legit_amounts.describe())
print("\nFraudulent Transactions (Class 1):")
print(fraud_amounts.describe())

# Statistical test: two-sided Mann-Whitney U on the two amount samples.
# NOTE: despite its name, t_stat holds the U statistic, not a t statistic;
# the name is kept so other cells referring to it keep working.
t_stat, p_value = stats.mannwhitneyu(legit_amounts, fraud_amounts, alternative='two-sided')
print("\nMann-Whitney U Test:")
print(f"  Test Statistic: {t_stat:.2f}")
print(f"  P-value: {p_value:.2e}")
if p_value < 0.05:
    print("  ✅ Significant difference in amount distributions (p < 0.05)")
else:
    print("  ❌ No significant difference in amount distributions (p >= 0.05)")

In [None]:
# Amount distribution visualization on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(raw_ax, log_ax), (box_ax, violin_ax) = axes

# Top-left: raw amounts, all transactions
raw_ax.hist(df['Amount'], bins=50, color='#3498db', alpha=0.7, edgecolor='black')
raw_ax.set_title('Amount Distribution (All Transactions)', fontsize=13, fontweight='bold')
raw_ax.set_xlabel('Amount ($)', fontsize=11)
raw_ax.set_ylabel('Frequency', fontsize=11)

# Top-right: log1p-transformed amounts
log_ax.hist(np.log1p(df['Amount']), bins=50, color='#9b59b6', alpha=0.7, edgecolor='black')
log_ax.set_title('Amount Distribution (Log Scale)', fontsize=13, fontweight='bold')
log_ax.set_xlabel('Log(Amount + 1)', fontsize=11)
log_ax.set_ylabel('Frequency', fontsize=11)

# Bottom-left: box plot per class, drawn by pandas onto our axes
df.boxplot(column='Amount', by='Class', ax=box_ax,
           patch_artist=True, grid=True)
box_ax.set_title('Amount Distribution by Class', fontsize=13, fontweight='bold')
box_ax.set_xlabel('Class (0=Legitimate, 1=Fraud)', fontsize=11)
box_ax.set_ylabel('Amount ($)', fontsize=11)
plt.suptitle('')  # Remove auto title

# Bottom-right: violin plot of the two per-class amount series
parts = violin_ax.violinplot([legit_amounts, fraud_amounts],
                             positions=[0, 1], showmeans=True, showmedians=True)
violin_ax.set_title('Amount Distribution (Violin Plot)', fontsize=13, fontweight='bold')
violin_ax.set_xlabel('Class (0=Legitimate, 1=Fraud)', fontsize=11)
violin_ax.set_ylabel('Amount ($)', fontsize=11)
violin_ax.set_xticks([0, 1])
violin_ax.set_xticklabels(['Legitimate', 'Fraud'])

# The box plot already received its grid through grid=True above
for panel in (raw_ax, log_ax, violin_ax):
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Amount distribution comparison (zoomed)
fig, ax = plt.subplots(figsize=(16, 6))

# Bin both classes on the SAME edges, restricted to the window being shown.
# Binning each series over its own full range gives the two classes different
# bin widths and leaves few bins inside the zoom window, so the overlaid
# histograms cannot be compared.
zoom_range = (0, 500)
n_bins = 100

# Plot both distributions. With `range` set, amounts outside the window are
# ignored and density=True normalises each histogram over the window only.
ax.hist(legit_amounts, bins=n_bins, range=zoom_range, alpha=0.6, label='Legitimate',
        color='#2ecc71', edgecolor='black', density=True)
ax.hist(fraud_amounts, bins=n_bins, range=zoom_range, alpha=0.6, label='Fraud',
        color='#e74c3c', edgecolor='black', density=True)

ax.set_title('Amount Distribution Comparison (Normalized)', fontsize=14, fontweight='bold')
ax.set_xlabel('Amount ($)', fontsize=12)
ax.set_ylabel('Density', fontsize=12)
ax.set_xlim(*zoom_range)  # Zoom to see most of the data
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. PCA Features Analysis (V1-V28)

In [None]:
# Collect every column whose name starts with 'V' (iterating a DataFrame
# yields its column labels)
v_columns = [name for name in df if name.startswith('V')]
print(f"Number of PCA features: {len(v_columns)}")

# Descriptive statistics restricted to those columns
print("\n" + "=" * 80, "PCA FEATURES STATISTICS", "=" * 80, sep="\n")
print(df[v_columns].describe())

In [None]:
# One histogram per V column, laid out on a fixed 7x4 grid
fig, axes = plt.subplots(7, 4, figsize=(20, 24))
axes = axes.ravel()

for panel_index, feature in enumerate(v_columns):
    panel = axes[panel_index]
    panel.hist(df[feature], bins=50, color='#16a085', alpha=0.7, edgecolor='black')
    panel.set_title(f'{feature} Distribution', fontsize=11, fontweight='bold')
    panel.set_xlabel(feature, fontsize=9)
    panel.set_ylabel('Frequency', fontsize=9)
    panel.grid(True, alpha=0.3)

fig.suptitle('Distribution of All PCA Features (V1-V28)',
             fontsize=16, fontweight='bold', y=1.0)
plt.tight_layout()
plt.show()

In [None]:
# V features comparison by class: every 5th V column, at most six of them
# (positions 0, 5, 10, 15, 20, 25 of v_columns)
sample_v_features = v_columns[:30:5]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for panel, feature in zip(axes, sample_v_features):
    # Legitimate (0) first, fraud (1) second, matching the labels and colours
    values_by_class = [df.loc[df['Class'] == label, feature] for label in (0, 1)]
    panel.hist(values_by_class,
               bins=50, alpha=0.7, label=['Legitimate', 'Fraud'],
               color=['#2ecc71', '#e74c3c'], edgecolor='black', density=True)
    panel.set_title(f'{feature} by Class', fontsize=12, fontweight='bold')
    panel.set_xlabel(feature, fontsize=10)
    panel.set_ylabel('Density', fontsize=10)
    panel.legend(fontsize=9)
    panel.grid(True, alpha=0.3)

fig.suptitle('Sample PCA Features Distribution by Class',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Correlation matrix for all features
# 'Time_hours' is only 'Time' / 3600 (added earlier in this notebook), so it
# is perfectly collinear with 'Time'. Leave it out so the same signal does not
# appear twice in the matrix, the rankings and the heatmap.
# errors='ignore' keeps this cell runnable if that helper column is absent.
correlation_matrix = df.drop(columns=['Time_hours'], errors='ignore').corr()

# Correlation with target (Class), strongest positive first
class_correlation = correlation_matrix['Class'].sort_values(ascending=False)
print("="*80)
print("TOP 15 FEATURES CORRELATED WITH FRAUD")
print("="*80)
print(class_correlation.head(16))  # Top 15 + Class itself

print("\n" + "="*80)
print("TOP 15 FEATURES NEGATIVELY CORRELATED WITH FRAUD")
print("="*80)
# The series is sorted descending, so the tail holds the most negative values.
print(class_correlation.tail(15))

In [None]:
# Correlation heatmap (full): diverging palette centred on zero, fixed to the
# [-1, 1] range, without cell annotations
fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    annot=False,
    cbar_kws={"shrink": 0.8},
)
ax.set_title('Correlation Matrix - All Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of each feature's correlation with Class
fig, ax = plt.subplots(figsize=(14, 10))

# Drop the target's self-correlation and order ascending (most negative first)
class_corr_sorted = class_correlation.drop('Class').sort_values()

# Green unless the coefficient is negative, in which case red
colors = ['#2ecc71' if not (coefficient < 0) else '#e74c3c'
          for coefficient in class_corr_sorted]

class_corr_sorted.plot(kind='barh', ax=ax, color=colors, alpha=0.8, edgecolor='black')
ax.axvline(x=0, color='black', linestyle='--', linewidth=1)  # zero reference line
ax.set_title('Feature Correlation with Fraud Class', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation Coefficient', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 8. Feature Importance via Statistical Tests

In [None]:
# Two-sided Mann-Whitney U test of every feature, legitimate vs. fraudulent rows
from scipy.stats import mannwhitneyu

# Per-class frames; later cells reuse both
legitimate = df[df['Class'] == 0]
fraudulent = df[df['Class'] == 1]

tested_features = v_columns + ['Time', 'Amount']
test_outcomes = {
    name: mannwhitneyu(legitimate[name], fraudulent[name], alternative='two-sided')
    for name in tested_features
}
# Each outcome unpacks as (statistic, p-value)
test_stats = {name: outcome[0] for name, outcome in test_outcomes.items()}
p_values = {name: outcome[1] for name, outcome in test_outcomes.items()}

# One row per feature, most significant (smallest p-value) first
statistical_test_results = pd.DataFrame({
    'Feature': list(p_values),
    'P-Value': [p_values[name] for name in p_values],
    'Test_Statistic': [test_stats[name] for name in test_stats],
    'Significant': ['Yes' if p < 0.05 else 'No' for p in p_values.values()],
})
statistical_test_results = statistical_test_results.sort_values('P-Value')

print("=" * 80,
      "STATISTICAL SIGNIFICANCE OF FEATURES (Mann-Whitney U Test)",
      "=" * 80,
      sep="\n")
print(f"\nFeatures with significant difference (p < 0.05): {(statistical_test_results['Significant'] == 'Yes').sum()} out of {len(statistical_test_results)}")
print("\nTop 15 most significant features:")
print(statistical_test_results.head(15))

In [None]:
# Visualize p-values
fig, ax = plt.subplots(figsize=(14, 10))

# Use negative log of p-values for better visualization.
# A p-value that underflows to exactly 0 would give -log10(0) = inf and an
# undrawable bar, so floor p-values at the smallest positive normal float
# (the bar then reaches the largest finite height, about 308).
p_floor = np.finfo(float).tiny
neg_log_p = -np.log10(statistical_test_results['P-Value'].clip(lower=p_floor))
# Red for features flagged significant, grey otherwise
colors_sig = ['#e74c3c' if x == 'Yes' else '#95a5a6' 
              for x in statistical_test_results['Significant']]

ax.barh(statistical_test_results['Feature'], neg_log_p, 
        color=colors_sig, alpha=0.8, edgecolor='black')
# Bars extending past this line have p < 0.05
ax.axvline(x=-np.log10(0.05), color='black', linestyle='--', 
           linewidth=2, label='Significance threshold (p=0.05)')
ax.set_xlabel('-log10(P-Value)', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.set_title('Statistical Significance of Features\n(Higher = More Discriminative)', 
             fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 9. Top Discriminative Features Deep Dive

In [None]:
# Pick four features: the two most positively and the two most negatively
# correlated with Class (the target's self-correlation is excluded)
feature_correlation = class_correlation.drop('Class')
top_positive = list(feature_correlation.nlargest(2).index)
top_negative = list(feature_correlation.nsmallest(2).index)
top_features = [*top_positive, *top_negative]

print("Most discriminative features:",
      f"  Positive correlation: {top_positive}",
      f"  Negative correlation: {top_negative}",
      sep="\n")

In [None]:
# Per-class density curves for each selected feature, one panel per feature
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

# (legend label, per-class frame, line colour), drawn in this order
class_curves = (
    ('Legitimate', legitimate, '#2ecc71'),
    ('Fraud', fraudulent, '#e74c3c'),
)

for panel, feature in zip(axes, top_features):
    for class_name, class_frame, line_color in class_curves:
        # KDE plot
        class_frame[feature].plot(kind='kde', ax=panel, label=class_name,
                                  color=line_color, linewidth=2.5)
    panel.set_title(f'{feature} Distribution by Class\nCorrelation: {class_correlation[feature]:.4f}',
                    fontsize=12, fontweight='bold')
    panel.set_xlabel(feature, fontsize=11)
    panel.set_ylabel('Density', fontsize=11)
    panel.legend(fontsize=10)
    panel.grid(True, alpha=0.3)

fig.suptitle('Top Discriminative Features - Density Comparison',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 10. Multi-Feature Relationships

In [None]:
# Scatter of the first two selected features, coloured by class
fig, ax = plt.subplots(figsize=(14, 10))

# Sample to limit overplotting: both classes get the same number of points,
# capped at 5000 and at the number of fraud rows available
sample_size = min(5000, len(fraudulent))
legit_sample = legitimate.sample(n=sample_size, random_state=42)
fraud_sample = fraudulent.sample(n=sample_size, random_state=42)

x_feature, y_feature = top_features[0], top_features[1]

# Legitimate points are drawn first so the larger fraud markers sit on top
scatter_layers = (
    (legit_sample, dict(alpha=0.5, s=30, label='Legitimate', color='#2ecc71')),
    (fraud_sample, dict(alpha=0.7, s=50, label='Fraud', color='#e74c3c')),
)
for points, marker_style in scatter_layers:
    ax.scatter(points[x_feature], points[y_feature],
               edgecolors='black', linewidth=0.5, **marker_style)

ax.set_title(f'Scatter Plot: {x_feature} vs {y_feature}',
             fontsize=14, fontweight='bold')
ax.set_xlabel(x_feature, fontsize=12)
ax.set_ylabel(y_feature, fontsize=12)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Pairplot of the first three selected features on a sampled subset:
# 1000 legitimate rows plus up to 1000 fraud rows
legit_rows = legitimate.sample(n=1000, random_state=42)
fraud_rows = fraudulent.sample(n=min(1000, len(fraudulent)), random_state=42)
sample_df = pd.concat([legit_rows, fraud_rows])

# Class is included only to drive the hue
pairplot_features = [*top_features[:3], 'Class']
class_palette = {0: '#2ecc71', 1: '#e74c3c'}
point_style = {'alpha': 0.6, 's': 40, 'edgecolor': 'black'}

g = sns.pairplot(
    sample_df[pairplot_features],
    hue='Class',
    palette=class_palette,
    diag_kind='kde',
    plot_kws=point_style,
    height=3,
)
g.fig.suptitle('Pairplot of Top Discriminative Features',
               fontsize=14, fontweight='bold', y=1.02)
plt.show()

## 11. Key Insights and Summary

In [None]:
# Recap of the analysis, assembled from values computed in earlier cells
# (missing, duplicates, class_counts, class_pct, imbalance_ratio,
# legit_amounts, fraud_amounts, p_value, class_correlation,
# statistical_test_results, v_columns).
print("="*80)
print("KEY INSIGHTS FROM EDA")
print("="*80)

# Count model inputs only: df also holds the target ('Class') and the derived
# 'Time_hours' helper column, which are not features.
n_features = len(df.columns.difference(['Class', 'Time_hours']))

# Report the data-quality checks from their computed results instead of
# asserting a clean dataset unconditionally.
n_missing = int(missing.sum())
n_duplicates = int(duplicates)
if n_missing == 0:
    missing_note = "No missing values: ✅"
else:
    missing_note = f"Missing values: {n_missing:,} ⚠️"
if n_duplicates == 0:
    duplicate_note = "No duplicates: ✅"
else:
    duplicate_note = f"Duplicate rows: {n_duplicates:,} ⚠️"

print("\n1. DATASET OVERVIEW")
print(f"   • Total transactions: {len(df):,}")
print(f"   • Features: {n_features} ({len(v_columns)} PCA features + Time + Amount)")
print(f"   • {missing_note}")
print(f"   • {duplicate_note}")

print("\n2. CLASS IMBALANCE")
print(f"   • Legitimate transactions: {class_counts[0]:,} ({class_pct[0]:.2f}%)")
print(f"   • Fraudulent transactions: {class_counts[1]:,} ({class_pct[1]:.2f}%)")
print(f"   • Imbalance ratio: {imbalance_ratio:.2f}:1")
print("   • ⚠️  Requires special handling (SMOTE, class weights, etc.)")

# NOTE(review): the two qualitative bullets below are fixed text, not derived
# from the computed results; confirm them against the temporal plots.
print("\n3. TEMPORAL PATTERNS")
print(f"   • Time span: {df['Time_hours'].max():.1f} hours (~{df['Time_hours'].max()/24:.1f} days)")
print("   • Fraud transactions show different temporal patterns")
print("   • Fraud rate varies over time bins")

print("\n4. AMOUNT ANALYSIS")
print(f"   • Mean amount (Legitimate): ${legit_amounts.mean():.2f}")
print(f"   • Mean amount (Fraud): ${fraud_amounts.mean():.2f}")
print(f"   • Median amount (Legitimate): ${legit_amounts.median():.2f}")
print(f"   • Median amount (Fraud): ${fraud_amounts.median():.2f}")
print(f"   • Statistical difference: {'Yes (p<0.05)' if p_value < 0.05 else 'No'}")

print("\n5. MOST IMPORTANT FEATURES (by correlation with fraud)")
# Rank by absolute correlation, but print the signed coefficient.
top_5_corr = class_correlation.drop('Class').abs().nlargest(5)
for i, feat in enumerate(top_5_corr.index, 1):
    print(f"   {i}. {feat}: {class_correlation[feat]:.4f}")

print("\n6. STATISTICALLY SIGNIFICANT FEATURES")
n_significant = sum(statistical_test_results['Significant'] == 'Yes')
print(f"   • {n_significant}/{len(statistical_test_results)} features show significant difference (p<0.05)")
# NOTE(review): fixed text; it is only accurate if every V feature is flagged
# significant in statistical_test_results.
print("   • All V features are discriminative to varying degrees")

print("\n7. RECOMMENDATIONS FOR MODELING")
print("   ✓ Use stratified splitting due to class imbalance")
print("   ✓ Apply class balancing techniques (SMOTE, class weights)")
print("   ✓ Focus on Precision-Recall metrics (not just accuracy)")
print("   ✓ Consider ensemble methods (Random Forest, XGBoost)")
print("   ✓ Feature scaling already done via PCA, but scale Time/Amount")
print("   ✓ Use cross-validation with stratification")

print("\n" + "="*80)

## 12. Export Summary Statistics

In [None]:
# Create summary report: a flat mapping of headline numbers computed in
# earlier cells (class counts, amount statistics, top correlations,
# significance count).
summary_stats = {
    'Dataset': 'creditcard.csv',
    'Total_Transactions': len(df),
    'Legitimate_Count': class_counts[0],
    'Fraud_Count': class_counts[1],
    'Fraud_Rate_%': class_pct[1],
    'Imbalance_Ratio': imbalance_ratio,
    'Time_Span_Hours': df['Time_hours'].max(),
    'Mean_Amount_Legitimate': legit_amounts.mean(),
    'Mean_Amount_Fraud': fraud_amounts.mean(),
    'Median_Amount_Legitimate': legit_amounts.median(),
    'Median_Amount_Fraud': fraud_amounts.median(),
    # top_5_corr is ordered by absolute correlation; report the signed values.
    'Top_Feature_1': top_5_corr.index[0],
    'Top_Feature_1_Correlation': class_correlation[top_5_corr.index[0]],
    'Top_Feature_2': top_5_corr.index[1],
    'Top_Feature_2_Correlation': class_correlation[top_5_corr.index[1]],
    'Significant_Features_Count': n_significant
}

# Single-row frame; printed transposed so each statistic is on its own line.
summary_df = pd.DataFrame([summary_stats])
print("\n📊 SUMMARY STATISTICS")
print(summary_df.T)

# Optionally save to CSV
# summary_df.to_csv('../data/processed/creditcard_eda_summary.csv', index=False)
# print("\n✅ Summary saved to: ../data/processed/creditcard_eda_summary.csv")

---
## End of EDA

**Next Steps:**
1. Feature engineering (if needed beyond PCA features)
2. Train-test split with stratification
3. Apply class balancing (SMOTE)
4. Model training and evaluation
5. Hyperparameter tuning
6. Model interpretation with SHAP