# STEP 4 - Fraud_Detection - Data Preparation & Entity Behavioral Risk Encoding

### Objective:
- Prepare data for modeling with fraud-specific behavioral intelligence
- No target leakage: frequency encoding is deferred to Step 5 (after the train/valid split) and uses NO target info
- Optimize memory & ensure zero NaNs before modeling

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Step 4A: Read the feature-engineered training frame
# ============================================================
# Loads the v1 parquet file and reports its shape together with the
# normalized class balance of the "isFraud" column.

train = pd.read_parquet("../data/processed/train_features_v1.parquet")

print("=" * 60, "STEP 4A: Dataset Loaded", "=" * 60, sep="\n")
print(f"Dataset shape: {train.shape}")
print(
    "\nTarget distribution:",
    train["isFraud"].value_counts(normalize=True),
    sep="\n",
)
print()

STEP 4A: Dataset Loaded
Dataset shape: (590540, 458)

Target distribution:
isFraud
0    0.96501
1    0.03499
Name: proportion, dtype: float64



In [3]:
# Step 4B: Split columns into numeric and categorical groups
# ============================================================
# Numeric   = int64 / float64 columns, with the target excluded.
# Categorical = object-dtype columns.
# Columns of any other dtype end up in neither list.

TARGET = "isFraud"

numeric_cols = [
    name
    for name in train.select_dtypes(include=["int64", "float64"]).columns
    if name != TARGET
]
categorical_cols = list(train.select_dtypes(include=["object"]).columns)

print(
    "=" * 60,
    "STEP 4B: Feature Groups Defined",
    "=" * 60,
    f"Numeric features: {len(numeric_cols)}",
    f"Categorical features: {len(categorical_cols)}",
    sep="\n",
)
print()

STEP 4B: Feature Groups Defined
Numeric features: 403
Categorical features: 32



In [4]:
# Step 4C: FREQUENCY ENCODING REMOVED (MOVED TO STEP 5)
# ============================================================
# This cell performs no computation. It only records in the notebook output
# that frequency features are intentionally not built here; per the message
# below they are computed in Step 5, after the train/valid split.

print("=" * 60)
print("STEP 4C: Frequency Encoding SKIPPED")
print("=" * 60)
# The warning glyph was stored mis-encoded (UTF-8 bytes read as MacRoman);
# it is restored to the intended character.
print("⚠️  CRITICAL: Frequency features will be calculated in Step 5")
print("    AFTER train/valid split to prevent data leakage")
print()

STEP 4C: Frequency Encoding SKIPPED
⚠️  CRITICAL: Frequency features will be calculated in Step 5
    AFTER train/valid split to prevent data leakage



In [5]:
# Step 4D: Type Optimization
# ============================================================
# Narrows 64-bit numeric columns to 32-bit to reduce memory usage.
#   * float64 -> float32 for every float64 column.
#   * int64   -> int32 for every int64 column except the target, and only
#     when all values fit in int32. A plain astype() does not range-check an
#     integer narrowing, so out-of-range values would wrap around silently.
# Columns that cannot be narrowed safely are left as int64 and reported.

print("=" * 60)
print("STEP 4D: Type Optimization")
print("=" * 60)

memory_before = train.memory_usage(deep=True).sum() / 1024**2

# Float64 -> Float32
float_cols = train.select_dtypes(include=["float64"]).columns
for col in float_cols:
    train[col] = train[col].astype("float32")

# Int64 -> Int32 (range-checked)
int32_info = np.iinfo(np.int32)
int_cols = train.select_dtypes(include=["int64"]).columns
skipped_int_cols = []
for col in int_cols:
    if col == TARGET:
        continue
    # Guard against silent wrap-around on values outside the int32 range.
    if train[col].min() < int32_info.min or train[col].max() > int32_info.max:
        skipped_int_cols.append(col)
        continue
    train[col] = train[col].astype("int32")

memory_after = train.memory_usage(deep=True).sum() / 1024**2

# Glyphs in the messages below were stored mis-encoded; restored here.
print(f"✓ Memory reduced: {memory_before:.2f} MB → {memory_after:.2f} MB")
print(f"  ({100 * (1 - memory_after/memory_before):.1f}% reduction)")
if skipped_int_cols:
    print(f"  Kept as int64 (values exceed int32 range): {skipped_int_cols}")
print()

STEP 4D: Type Optimization
✓ Memory reduced: 2540.50 MB → 1632.65 MB
  (35.7% reduction)



In [6]:
# Step 4E: Recompute the feature groups after downcasting
# ============================================================
# Same grouping as Step 4B, but the numeric filter now also accepts the
# 32-bit dtypes. Columns whose dtype is not listed here fall in neither
# group. The "Total features" line reports the frame's full column count,
# which includes the target column.

numeric_dtypes = ["int32", "int64", "float32", "float64"]

numeric_cols_updated = [
    name
    for name in train.select_dtypes(include=numeric_dtypes).columns
    if name != TARGET
]
categorical_cols_updated = list(train.select_dtypes(include=["object"]).columns)

print(
    "=" * 60,
    "STEP 4E: Feature Summary",
    "=" * 60,
    "Numeric features: {}".format(len(numeric_cols_updated)),
    "Categorical features: {}".format(len(categorical_cols_updated)),
    "Total features: {}".format(train.shape[1]),
    sep="\n",
)
print()

STEP 4E: Feature Summary
Numeric features: 404
Categorical features: 32
Total features: 458



In [7]:
# Step 4F: FINAL SAFETY IMPUTATION
# ============================================================
# Fills every remaining NaN so the saved frame is NaN-free:
#   * numeric columns (numeric_cols_updated)       -> column median
#   * object columns  (categorical_cols_updated)   -> the literal "Unknown"
# Afterwards the zero-NaN post-condition is asserted rather than only
# printed, so columns that belong to neither list cannot slip through.
#
# NOTE(review): the medians are computed over the full frame, i.e. before the
# train/valid split that the Step 4C message places in Step 5. If strict
# leakage control matters for these features, fit the medians on the training
# fold instead -- confirm with the Step 5 notebook.

print("=" * 60)
print("STEP 4F: Final Safety Imputation")
print("=" * 60)

missing_before = train.isna().sum().sum()
print(f"Missing values before imputation: {missing_before:,}")

# Numeric imputation
for col in numeric_cols_updated:
    if train[col].isna().any():
        fill_value = train[col].median()
        # An all-NaN column has a NaN median, which would make fillna a
        # no-op; fall back to 0 so the column still ends up NaN-free.
        if pd.isna(fill_value):
            fill_value = 0
        train[col] = train[col].fillna(fill_value)

# Categorical imputation
for col in categorical_cols_updated:
    if train[col].isna().any():
        train[col] = train[col].fillna("Unknown")

missing_after = train.isna().sum().sum()
print(f"Missing values after imputation: {missing_after:,}")
print()

# Enforce the stated goal of this step; name the offending columns on failure.
assert missing_after == 0, (
    "Columns still containing NaN after imputation: "
    f"{train.columns[train.isna().any()].tolist()}"
)

STEP 4F: Final Safety Imputation
Missing values before imputation: 115,523,073
Missing values after imputation: 0



In [8]:
# Step 4G: Report the final state of the prepared frame
# ============================================================
# Prints the shape, the total count of remaining NaN cells, and the deep
# memory footprint in MB. Nothing is modified here.

print("=" * 60, "STEP 4G: Final Dataset Validation", "=" * 60, sep="\n")
print("Final dataset shape: {}".format(train.shape))
print("Remaining missing values: {}".format(train.isna().sum().sum()))
print(
    "Memory usage: {:.2f} MB".format(
        train.memory_usage(deep=True).sum() / 1024**2
    )
)
print()

STEP 4G: Final Dataset Validation
Final dataset shape: (590540, 458)
Remaining missing values: 0
Memory usage: 2065.39 MB



In [9]:
# Step 4H: Save Prepared Dataset
# ============================================================
# Writes the prepared frame to the v2 parquet file (without the index) and
# prints a completion banner. The glyphs in the status messages were stored
# mis-encoded (UTF-8 bytes read as MacRoman) and are restored here; the one
# f-string that had no placeholders is now a plain string.

output_path = "../data/processed/train_features_v2.parquet"
train.to_parquet(output_path, index=False)

print("=" * 60)
print("STEP 4H: Dataset Saved")
print("=" * 60)
print(f"✓ Saved to: {output_path}")
print(f"✓ Shape: {train.shape}")
print("✓ Ready for modeling (frequency encoding in Step 5)")
print("=" * 60)
print("\n🎯 Step 4 completed successfully!")
print("   Next: Step 5 - Model Training (with proper frequency encoding)")
print("=" * 60)

STEP 4H: Dataset Saved
✓ Saved to: ../data/processed/train_features_v2.parquet
✓ Shape: (590540, 458)
✓ Ready for modeling (frequency encoding in Step 5)

🎯 Step 4 completed successfully!
   Next: Step 5 - Model Training (with proper frequency encoding)
