### Setup Python Path (Consistent with other notebooks)

In [None]:
import sys
import os

# Bootstrap: `config` is not importable until a directory containing it is on
# sys.path, so assume the working directory sits one level below the project
# root and expose `<parent>/src` for the duration of the import.
project_root_guess = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
temp_src_path = os.path.join(project_root_guess, 'src')
if temp_src_path not in sys.path:
    sys.path.append(temp_src_path)

# With the guessed location available, load the authoritative paths.
from config import PROJECT_ROOT, CREDITCARD_FULL_PATH

# Replace the guessed entry with the one derived from PROJECT_ROOT.
try:
    sys.path.remove(temp_src_path)
except ValueError:
    # The guessed path was not on sys.path, so there is nothing to undo.
    pass

correct_src_path = os.path.join(PROJECT_ROOT, 'src')
if correct_src_path not in sys.path:
    sys.path.append(correct_src_path)

# Report the resolved locations, one per line.
print(
    f"EDA Notebook Ready for Credit Card Data",
    f"Project Root: {PROJECT_ROOT}",
    f"Loading from: {CREDITCARD_FULL_PATH}",
    sep="\n",
)

###  Imports and Visualization Setup

In [None]:
# Imports and Visualization Setup
# Data handling (pandas, numpy) and plotting (matplotlib, seaborn) used by the
# cells that follow.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation notices from the
# plotting libraries — consider narrowing the filter.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline
# The three settings below are applied in sequence; the palette and figure size
# come after the style sheet, presumably so they override any values the style
# sheet itself sets — TODO confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette('muted')
plt.rcParams['figure.figsize'] = (12, 6)  # default (width, height) in inches

# Banner identifying which dataset this notebook explores.
print("EDA Notebook: Credit Card Fraud Data (creditcard.csv)")

### Load Dataset

In [None]:
# Load Dataset
# Read the CSV at the path provided by `config` into a single DataFrame that
# the remaining cells work from.
print("Loading creditcard.csv...\n")

credit_df = pd.read_csv(CREDITCARD_FULL_PATH)

# Unpack the shape once so the status line reads cleanly.
n_rows, n_cols = credit_df.shape
print(f"✅ Dataset loaded: {n_rows:,} rows × {n_cols} columns")
print(f"Features: Time, V1–V28 (PCA-transformed), Amount, Class")

# Bare last expression: rich preview of the first rows.
credit_df.head()

In [None]:
# Cell 3: Basic Data Properties
# Prints a five-part overview of credit_df: shape/columns, dtypes, missing
# values, duplicate rows, and descriptive statistics.
from IPython.display import display, Markdown

print("=== 1. Dataset Overview ===\n")
n_rows, n_cols = credit_df.shape
column_list = ', '.join(credit_df.columns)
display(Markdown(f"**Shape**: {n_rows:,} rows × {n_cols} columns"))
display(Markdown(f"**Columns**: {column_list}"))

print("\n=== 2. Data Types & Info ===\n")
credit_df.info()

print("\n=== 3. Missing Values ===\n")
missing = credit_df.isnull().sum()
# Null counts are non-negative, so "no column above zero" is the same
# condition as "total is zero".
columns_with_nulls = missing[missing > 0]
if columns_with_nulls.empty:
    display(Markdown("**No missing values found!** ✅"))
else:
    display(columns_with_nulls)

print("\n=== 4. Duplicate Rows ===\n")
duplicates = credit_df.duplicated().sum()
duplicate_note = f"**Duplicate transactions**: {duplicates:,} (common in real data; will be removed in preprocessing)"
display(Markdown(duplicate_note))

print("\n=== 5. Descriptive Statistics (Numerical Features) ===\n")
# One row per feature, rounded to 4 decimals, shaded with a blue gradient.
stats_table = credit_df.describe().T.round(4)
display(stats_table.style.background_gradient(cmap='Blues'))

In [None]:
# Cell 4: Target Variable - Extreme Class Imbalance
# Three side-by-side views of the 'Class' column (raw counts, log-scale counts,
# pie chart) followed by a printed numeric summary.
print("=== Class Distribution (Target: 'Class') ===\n")

class_counts = credit_df['Class'].value_counts()
class_pct = credit_df['Class'].value_counts(normalize=True) * 100

# Shared styling so the three panels stay visually consistent.
class_colors = ['#1f77b4', '#d62728']
panel_title_style = {'fontsize': 14, 'fontweight': 'bold'}

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

# Panel 1: raw counts, with each bar labelled just above its top.
sns.countplot(data=credit_df, x='Class', ax=ax1, palette=class_colors)
ax1.set_title('Class Distribution (Count)', **panel_title_style)
for bar in ax1.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width()/2.
    ax1.annotate(f'{int(bar_height):,}', (bar_centre, bar_height + 5000),
                 ha='center', va='bottom', fontsize=12, fontweight='bold')

# Panel 2: the same counts on a log axis so the minority bar is visible.
class_counts.plot.bar(logy=True, ax=ax2, color=class_colors)
ax2.set_title('Class Distribution (Log Scale)', **panel_title_style)

# Panel 3: proportions as a pie chart.
ax3.pie(class_counts, labels=['Legitimate', 'Fraud'], autopct='%1.4f%%',
        colors=class_colors, textprops={'fontsize': 12, 'fontweight': 'bold'})
ax3.set_title('Fraud Proportion', **panel_title_style)

plt.suptitle('Extreme Class Imbalance – Typical in Fraud Detection', fontsize=18, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Numeric summary; label 1 is reported as fraudulent, label 0 as legitimate.
print(f"Total transactions: {len(credit_df):,}")
print(f"Fraudulent: {class_counts[1]:,} ({class_pct[1]:.4f}%)")
print(f"Legitimate: {class_counts[0]:,} ({class_pct[0]:.4f}%)")
print("Challenge: Highly imbalanced → need PR-AUC, F1, or oversampling/undersampling")

In [None]:
# Cell 5: Transaction Time and Amount Analysis
# Draws a 2x2 grid: raw Time histogram, Time folded onto a 24-hour cycle,
# Amount histogram (x-axis capped at 2000), and Amount by Class (y-axis capped
# at 400).
print("=== Time and Amount Features ===\n")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Time distribution (seconds from first transaction)
sns.histplot(credit_df['Time'], bins=100, kde=True, ax=axes[0,0], color='purple')
axes[0,0].set_title('Distribution of Transaction Time (seconds since first)')
axes[0,0].set_xlabel('Time (seconds)')

# Convert Time to hours and fold onto a 24-hour cycle to expose periodicity.
# The result is kept as a standalone Series instead of being written back as a
# new credit_df column: mutating the shared frame would make the derived
# column show up in later whole-frame summaries (e.g. credit_df.corr() or
# credit_df.describe()) as if it were an original feature.
# Note: hours are counted from the first transaction, not from a wall clock.
hour_of_day = (credit_df['Time'] / 3600) % 24
sns.histplot(hour_of_day, bins=48, kde=True, ax=axes[0,1], color='teal')
axes[0,1].set_title('Transactions by Hour of Day (Periodic Pattern)')
axes[0,1].set_xlabel('Hour of Day')

# Amount distribution
sns.histplot(credit_df['Amount'], bins=100, kde=True, ax=axes[1,0], color='orange')
axes[1,0].set_title('Transaction Amount Distribution')
axes[1,0].set_xlim(0, 2000)  # Focus on majority
axes[1,0].set_xlabel('Amount ($)')

# Amount by Class (zoomed)
sns.boxplot(data=credit_df, x='Class', y='Amount', ax=axes[1,1], palette='Set2')
axes[1,1].set_title('Amount by Class (zoomed)')
axes[1,1].set_ylim(0, 400)  # clip the y-axis so the boxes are readable

plt.suptitle('Time and Amount Patterns', fontsize=18, fontweight='bold')
plt.tight_layout()
plt.show()

print("Insight: Fraudulent transactions tend to have smaller amounts and occur at different times.")

In [None]:
# Cell 6: PCA Features (V1–V28) - Sample Exploration
# Box plots of the first eight V columns split by Class, on a 2x4 grid.
print("=== Anonymized PCA Features (V1–V28) ===\n")

# Full list of V column names; only the first eight are plotted.
v_features = [f'V{n}' for n in range(1, 29)]
sample_v = v_features[:8]

fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

# Pair each flattened panel with one component.
for panel, component in zip(axes, sample_v):
    sns.boxplot(data=credit_df, x='Class', y=component, ax=panel, palette='husl')
    panel.set_title(f'{component} by Class')

plt.suptitle('Sample PCA Components (V1–V8) by Fraud Class', fontsize=18, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

print("Insight: Several V features show clear separation between fraud and legitimate — PCA preserved discriminative power.")

In [None]:
# Cell 7: Correlation with Target (Top Features)
# Ranks every column of credit_df by the absolute value of its correlation
# with 'Class', plots the ten strongest, and shows them as a table.
print("=== Feature Importance Hint: Correlation with Class ===\n")

# Take the 'Class' column of the full correlation matrix, drop the target's
# own entry, and sort by magnitude (sign is discarded by .abs()).
correlations = credit_df.corr()['Class'].drop('Class').abs().sort_values(ascending=False)
top_corr = correlations.head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_corr.values, y=top_corr.index, palette='viridis')
plt.title('Top 10 Features by Absolute Correlation with Fraud Class')
plt.xlabel('Absolute Correlation')
plt.tight_layout()
plt.show()

# Fix: the original line was missing both closing parentheses, so the whole
# cell failed with a SyntaxError before anything ran.
display(top_corr.round(4).to_frame('Abs Correlation'))

In [None]:
# Cell 8: Summary of Key Insights
# Static recap of the findings and modelling recommendations, emitted as one
# newline-joined block.
summary_lines = (
    "CREDIT CARD DATA EDA SUMMARY\n",
    "• Extremely imbalanced: Only ~0.17% fraudulent transactions",
    "• Features V1–V28 are PCA-transformed and already scaled (privacy-protected)",
    "• Amount: Fraud cases typically involve lower amounts",
    "• Time: Shows daily periodicity; fraud timing differs slightly",
    "• Several V components (e.g., V11, V4, V14, V17) strongly correlate with fraud",
    "• No missing values; duplicates present (will be removed in preprocessing)",
    "\n Dataset ready for modeling — recommend:",
    "   - Use PR-AUC as primary metric",
    "   - Apply SMOTE or class weighting",
    "   - Tree-based models (XGBoost, LightGBM) likely to perform well on PCA features",
)
print("\n".join(summary_lines))