# EDA và Data Preprocessing cho Credit Card Fraud Detection

Notebook này thực hiện:
1. Exploratory Data Analysis (EDA)
2. Data Preprocessing sử dụng các functions từ `src/data_preprocessing.py`


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import sys
from pathlib import Path
# Silence every warning category for the rest of the session to keep cell
# output compact. NOTE(review): this also hides deprecation warnings raised
# by the libraries above — narrow the filter if those matter.
warnings.filterwarnings('ignore')

# Set style
# White-grid seaborn theme and a 12x6 default figure size for all later plots
# (individual cells override the size via their own figsize arguments).
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Locate the project root, i.e. the directory that contains `src`.
# The notebook may be started from the project root itself or from a
# sub-directory one level below it (e.g. `notebooks/`), so probe the working
# directory first and then its parent. If neither contains `src`, fall back
# to the working directory.
current_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (current_dir, current_dir.parent)
        if (candidate / 'src').exists()
    ),
    current_dir,
)

# Put the project root at the front of the import path (once) so that
# `src.*` modules can be imported from this notebook.
project_root_str = str(project_root.absolute())
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

# Preprocessing helpers from the project's own `src` package; this import
# relies on the project root being present on sys.path.
from src.data_preprocessing import (
    apply_smote,
    get_class_weights,
    scale_features,
    split_data,
)

# Report where the notebook is running so path problems are easy to spot.
print(
    f"Current working directory: {Path.cwd()}",
    f"Project root: {project_root.absolute()}",
    "Libraries imported successfully!",
    sep="\n",
)


## 1. Load và Kiểm tra Dữ liệu Cơ bản


In [None]:
# Read the raw transactions table from <project_root>/data/creditcard.csv.
data_path = project_root / 'data' / 'creditcard.csv'
df = pd.read_csv(data_path)

# One print call, one message per line; the blank separator lines come from
# the leading "\n" inside the messages themselves.
print(
    f"Shape of dataset: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    f"\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell, so the notebook renders the preview table.
df.head()


In [None]:
# Structural overview of the loaded DataFrame followed by summary statistics.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() — that would append a stray "None" line.
df.info()
print("\n" + "="*50)
print("\nBasic Statistics:")
# Last expression of the cell, so the notebook renders the statistics table.
df.describe()


## 2. Kiểm tra Missing Values


In [None]:
# Per-column null counts and their share of all rows (in percent).
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percent
})
# Keep only the columns that have at least one null, largest count first.
columns_with_gaps = missing_df['Missing Count'] > 0
missing_df = missing_df[columns_with_gaps].sort_values('Missing Count', ascending=False)

if len(missing_df) == 0:
    print("✓ No missing values found in the dataset!")
else:
    print("Missing Values Found:")
    print(missing_df)
    # reset_index() turns the column names (the frame's index) into a regular
    # 'index' column, which is what the bar plot uses for its x axis.
    plot_data = missing_df.reset_index()
    plt.figure(figsize=(10, 6))
    sns.barplot(data=plot_data, x='index', y='Missing Percentage')
    plt.title('Missing Values by Column')
    plt.xlabel('Column')
    plt.ylabel('Missing Percentage (%)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


## 3. Kiểm tra Outliers


In [None]:
# Outlier screening for the Amount column with two rules.
amount = df['Amount']

# Rule 1 — IQR fences: anything more than 1.5 * IQR below Q1 or above Q3.
Q1 = amount.quantile(0.25)
Q3 = amount.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

outside_fences = (amount < lower_bound) | (amount > upper_bound)
outliers_iqr = df[outside_fences]
iqr_share = len(outliers_iqr) / len(df) * 100
print(f"Outliers detected using IQR method: {len(outliers_iqr)} ({iqr_share:.2f}%)")
print(f"Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")

# Rule 2 — z-score: rows whose absolute standardized Amount exceeds 3.
from scipy import stats
z_scores = np.abs(stats.zscore(amount))
outliers_zscore = df[z_scores > 3]
zscore_share = len(outliers_zscore) / len(df) * 100
print(f"\nOutliers detected using Z-score method (|z| > 3): {len(outliers_zscore)} ({zscore_share:.2f}%)")


In [None]:
# Figure 1: raw Amount — box plot next to a histogram with the IQR fences.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
box_ax, hist_ax = axes

box_ax.boxplot(df['Amount'], vert=True)
box_ax.set_title('Box Plot of Amount (with outliers)')
box_ax.set_ylabel('Amount')
box_ax.grid(True, alpha=0.3)

hist_ax.hist(df['Amount'], bins=50, edgecolor='black', alpha=0.7)
# Dashed red lines mark lower_bound / upper_bound, the IQR fences.
hist_ax.axvline(lower_bound, color='r', linestyle='--', label=f'Lower bound: {lower_bound:.2f}')
hist_ax.axvline(upper_bound, color='r', linestyle='--', label=f'Upper bound: {upper_bound:.2f}')
hist_ax.set_title('Distribution of Amount')
hist_ax.set_xlabel('Amount')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Figure 2: Amount before and after a log transform, side by side.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
raw_ax, log_ax = axes

raw_ax.hist(df['Amount'], bins=50, edgecolor='black', alpha=0.7)
raw_ax.set_title('Original Amount Distribution')
raw_ax.set_xlabel('Amount')
raw_ax.set_ylabel('Frequency')

# np.log1p computes log(1 + x), so zero amounts map to 0 instead of -inf.
log_ax.hist(np.log1p(df['Amount']), bins=50, edgecolor='black', alpha=0.7)
log_ax.set_title('Log-transformed Amount Distribution')
log_ax.set_xlabel('Log(Amount + 1)')
log_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


## 4. Phân phối các Features


In [None]:
# Compare the Amount distribution between normal (Class 0) and fraud (Class 1).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: overlaid histograms, one per class. `.loc` selects the rows and
# the column in a single step instead of chained indexing.
df.loc[df['Class'] == 0, 'Amount'].hist(bins=50, alpha=0.7, label='Normal (0)', ax=axes[0])
df.loc[df['Class'] == 1, 'Amount'].hist(bins=50, alpha=0.7, label='Fraud (1)', ax=axes[0])
axes[0].set_title('Distribution of Amount by Class')
axes[0].set_xlabel('Amount')
axes[0].set_ylabel('Frequency')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right panel: one box per class.
df.boxplot(column='Amount', by='Class', ax=axes[1])
# With `by=`, pandas adds a figure-level "Boxplot grouped by Class" suptitle.
# On this two-panel figure it would sit above the histogram as well, so clear
# it and let each panel keep only its own title.
fig.suptitle('')
axes[1].set_title('Amount Distribution by Class')
axes[1].set_xlabel('Class')
axes[1].set_ylabel('Amount')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric summary of Amount for each class.
print("Amount Statistics by Class:")
print(df.groupby('Class')['Amount'].describe())


In [None]:
# Histograms of the first ten V features, normal vs fraud overlaid.
sample_features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10']

# 2x5 grid, flattened so panels can be paired with features one-to-one.
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.flatten()

# Split the rows by class once instead of re-filtering for every feature.
normal_rows = df[df['Class'] == 0]
fraud_rows = df[df['Class'] == 1]

for ax, feature in zip(axes, sample_features):
    normal_rows[feature].hist(bins=50, alpha=0.7, label='Normal', ax=ax, color='blue')
    fraud_rows[feature].hist(bins=50, alpha=0.7, label='Fraud', ax=ax, color='red')
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Density curves, normal vs fraud, for a spread of the V1..V28 columns.
v_features = [f'V{i}' for i in range(1, 29)]

# Every third feature (V1, V4, ..., V28) gives ten panels — one per grid cell.
selected_v_features = v_features[::3]

fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.flatten()

# Split the rows by class once instead of re-filtering for every feature.
normal_rows = df[df['Class'] == 0]
fraud_rows = df[df['Class'] == 1]

for ax, feature in zip(axes, selected_v_features):
    normal_rows[feature].plot.density(ax=ax, label='Normal', color='blue', alpha=0.7)
    fraud_rows[feature].plot.density(ax=ax, label='Fraud', color='red', alpha=0.7)
    ax.set_title(f'Density Plot of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 5. Kiểm tra Tỉ lệ Fraud/Normal (Class Imbalance)


In [None]:
# Absolute and relative frequency of each Class label.
class_counts = df['Class'].value_counts()
class_percentages = df['Class'].value_counts(normalize=True) * 100

# Look the two labels up by value (0 = normal, 1 = fraud).
normal_count, fraud_count = class_counts[0], class_counts[1]

print("Class Distribution:")
print(f"Normal (0): {normal_count:,} ({class_percentages[0]:.2f}%)")
print(f"Fraud (1): {fraud_count:,} ({class_percentages[1]:.2f}%)")
print(f"\nImbalance Ratio: {normal_count/fraud_count:.2f}:1 (Normal:Fraud)")


In [None]:
# Bar chart and pie chart of the class counts, side by side.
# NOTE(review): the tick/legend labels and colors below are attached by
# position, so they rely on class_counts listing class 0 before class 1
# (value_counts orders by frequency) — verify if the class balance changes.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_ax, pie_ax = axes

class_counts.plot(kind='bar', ax=bar_ax, color=['blue', 'red'], alpha=0.7)
bar_ax.set_title('Class Distribution (Bar Chart)')
bar_ax.set_xlabel('Class')
bar_ax.set_ylabel('Count')
bar_ax.set_xticklabels(['Normal (0)', 'Fraud (1)'], rotation=0)
bar_ax.grid(True, alpha=0.3, axis='y')
# Write each count just above its bar.
for position, count in enumerate(class_counts):
    bar_ax.text(position, count, f'{count:,}', ha='center', va='bottom', fontweight='bold')

class_counts.plot(kind='pie', ax=pie_ax, autopct='%1.2f%%', colors=['blue', 'red'], startangle=90)
pie_ax.set_title('Class Distribution (Pie Chart)')
pie_ax.set_ylabel('')
pie_ax.legend(['Normal (0)', 'Fraud (1)'])

plt.tight_layout()
plt.show()

print(
    "\n⚠️ Dataset is highly imbalanced!",
    "This will require special handling during model training.",
    sep="\n",
)


## 6. Data Preprocessing

Sử dụng các functions từ `src/data_preprocessing.py` để thực hiện preprocessing.


In [None]:
# Model inputs are V1..V28 plus Amount; the target is the Class column.
feature_cols = [*(f'V{i}' for i in range(1, 29)), 'Amount']
X, y = df[feature_cols], df['Class']

print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"\nFeature columns: {feature_cols[:5]}... (total {len(feature_cols)} features)",
    sep="\n",
)


### 6.1 Scaling Features


In [None]:
# Fit the project's scaler on the feature columns and get back the scaled
# frame together with the fitted scaler object.
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/validation/test split in section 6.2, so validation/test statistics
# leak into the training features. Consider splitting first and fitting on the
# training portion only — confirm how scale_features applies an existing scaler.
X_scaled, scaler = scale_features(X, feature_cols=feature_cols, fit=True)

print("Features scaled successfully!")
print(f"Scaled features shape: {X_scaled.shape}")

# Learned parameters (presumably from a StandardScaler — verify in
# src/data_preprocessing.py); only the first five entries are shown.
first_means = scaler.mean_[:5]
first_scales = scaler.scale_[:5]
print(f"\nScaler statistics:")
print(f"Mean: {first_means}... (showing first 5)")
print(f"Scale: {first_scales}... (showing first 5)")

# Sanity check: per-column mean and standard deviation after scaling.
scaled_summary = X_scaled[feature_cols].describe()
print("\nVerification - Statistics after scaling:")
print(scaled_summary.loc[['mean', 'std']])


### 6.2 Train/Validation/Test Split


In [None]:
# Split into train / validation / test sets. The intent is 70% / 15% / 15%;
# how val_size is interpreted depends on split_data, so the percentages
# printed below are the ground truth.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    X_scaled,
    y,
    test_size=0.15,
    val_size=0.15,
    random_state=42,
)

total_rows = len(df)
n_train, n_val, n_test = X_train.shape[0], X_val.shape[0], X_test.shape[0]

print("Data split successfully!")
print(f"\nTrain set: {n_train:,} samples ({n_train/total_rows*100:.1f}%)")
print(f"Validation set: {n_val:,} samples ({n_val/total_rows*100:.1f}%)")
print(f"Test set: {n_test:,} samples ({n_test/total_rows*100:.1f}%)")

# Per-split class counts and fraud share, to check that the imbalance is
# similar across the three sets.
print("\nClass distribution in each split:")

train_counts = y_train.value_counts()
print("\nTrain set:")
print(train_counts)
print(f"Fraud ratio: {train_counts[1]/len(y_train)*100:.2f}%")

val_counts = y_val.value_counts()
print("\nValidation set:")
print(val_counts)
print(f"Fraud ratio: {val_counts[1]/len(y_val)*100:.2f}%")

test_counts = y_test.value_counts()
print("\nTest set:")
print(test_counts)
print(f"Fraud ratio: {test_counts[1]/len(y_test)*100:.2f}%")


### 6.3 Kế hoạch Xử lý Imbalance

Có 2 phương pháp chính để xử lý class imbalance:

1. **SMOTE (Synthetic Minority Oversampling Technique)**: Tạo synthetic samples cho minority class
2. **Class Weights**: Điều chỉnh trọng số trong quá trình training

Cả 2 methods đã được implement trong `src/data_preprocessing.py`. Dưới đây là demo cách sử dụng:


In [None]:
# Method 1: derive class weights from the training labels only.
class_weights = get_class_weights(y_train)

print("Class Weights (for use with class_weight parameter in models):")
print(class_weights)
# Illustrative snippet only — it is printed, not executed.
print(
    "\nUsage example:",
    "model = RandomForestClassifier(class_weight=class_weights)",
    "model.fit(X_train, y_train)",
    sep="\n",
)


In [None]:
# Method 2: oversample the minority class with SMOTE.
# SMOTE must ONLY be applied to the training data; validation and test sets
# stay untouched so that evaluation reflects the real class balance.
print("Applying SMOTE to training data...")
X_train_smote, y_train_smote = apply_smote(X_train, y_train, random_state=42)

print(f"\nBefore SMOTE:")
print(f"  Shape: {X_train.shape}")
print(f"  Class distribution: {y_train.value_counts().to_dict()}")

print(f"\nAfter SMOTE:")
print(f"  Shape: {X_train_smote.shape}")
unique, counts = np.unique(y_train_smote, return_counts=True)
# .tolist() converts the NumPy scalars to built-in Python values, so the dict
# prints as {0: n, 1: m} on every NumPy version (NumPy 2 would otherwise show
# np.int64(...) wrappers) and matches the "Before SMOTE" line above.
print(f"  Class distribution: {dict(zip(unique.tolist(), counts.tolist()))}")

print("\n⚠️ Note: Use X_train_smote, y_train_smote for training if using SMOTE")
print("⚠️ Keep X_val, y_val and X_test, y_test unchanged for validation and testing")


### 6.4 So sánh Phương pháp Xử lý Imbalance

**SMOTE:**
- ✅ Tăng số lượng samples cho minority class
- ✅ Tạo synthetic samples thay vì duplicate
- ❌ Có thể tạo noisy samples nếu minority class quá nhỏ
- ❌ Tốn thời gian tính toán
- ❌ Chỉ nên áp dụng trên training data

**Class Weights:**
- ✅ Không cần thay đổi dữ liệu
- ✅ Nhanh hơn SMOTE
- ✅ Hoạt động tốt với tree-based models
- ✅ Giữ nguyên phân phối dữ liệu gốc
- ❌ Có thể không hiệu quả bằng SMOTE cho một số models
- ❌ Cần model hỗ trợ class_weight parameter

**Khuyến nghị:**
- Thử cả 2 phương pháp và so sánh kết quả
- Với dataset này (imbalance ratio ~578:1), nên thử cả SMOTE và class_weight
- Có thể kết hợp cả 2: SMOTE + class_weight cho một số models


## 7. Lưu Preprocessed Data (Optional)

Có thể lưu preprocessed data và scaler để sử dụng sau này. Cell bên dưới chỉ lưu scaler; các tập dữ liệu đã xử lý chưa được ghi ra file.


In [None]:
# Persist the fitted scaler so the same transformation can be reused later.
import pickle

# <project_root>/models is created on first use; an existing directory is fine.
models_dir = project_root / 'models'
models_dir.mkdir(exist_ok=True)

scaler_path = models_dir / 'scaler.pkl'
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print(f"Scaler saved to '{scaler_path}'")
print("\nPreprocessing completed successfully!")
# Reminder of which variables feed the modelling steps that follow.
print(
    "\nNext steps:",
    "1. Use X_train, y_train (or X_train_smote, y_train_smote) for training",
    "2. Use X_val, y_val for validation during training",
    "3. Use X_test, y_test for final evaluation",
    "4. Use class_weights or SMOTE to handle imbalance",
    sep="\n",
)
