# STEP 3 â€” FEATURE ENGINEERING
### Objective:
- Create fraud-specific behavioral features
- Transform raw data into predictive signals
- NO target leakage (no group-by with isFraud)

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Step 3A: Load Cleaned Dataset (from Step 2)


train = pd.read_parquet("../data/raw/train_merged.parquet")

print("=" * 60)
print("STEP 3A: Dataset Loaded")
print("=" * 60)
print(f"Dataset shape: {train.shape}")
print(f"\nTarget distribution:")
print(train["isFraud"].value_counts(normalize=True))
print()

STEP 3A: Dataset Loaded
Dataset shape: (590540, 434)

Target distribution:
isFraud
0    0.96501
1    0.03499
Name: proportion, dtype: float64



In [3]:
# Step 3B: Transaction Amount Features
# Log transformation handles skewness
# Threshold flags capture extreme values


print("=" * 60)
print("STEP 3B: Transaction Amount Features")
print("=" * 60)

# Log transformation (handles skewed distribution)
train["log_transaction_amt"] = np.log1p(train["TransactionAmt"])

# Extreme value flags
q10 = train["TransactionAmt"].quantile(0.10)
q90 = train["TransactionAmt"].quantile(0.90)

train["is_low_amt"] = (train["TransactionAmt"] <= q10).astype("int8")
train["is_high_amt"] = (train["TransactionAmt"] >= q90).astype("int8")

print("âœ“ Created 3 transaction amount features")
print(f"\nAmount Distribution:")
print(train[["TransactionAmt", "log_transaction_amt"]].describe())
print(f"\nExtreme Value Proportions:")
print(f"  Low amount (â‰¤10th percentile): {train['is_low_amt'].mean():.2%}")
print(f"  High amount (â‰¥90th percentile): {train['is_high_amt'].mean():.2%}")
print()

# Fraud rate validation
print("Fraud Rate by Amount Category:")
print(f"  Low amount: {train[train['is_low_amt']==1]['isFraud'].mean():.2%}")
print(f"  High amount: {train[train['is_high_amt']==1]['isFraud'].mean():.2%}")
print()

STEP 3B: Transaction Amount Features
âœ“ Created 3 transaction amount features

Amount Distribution:
       TransactionAmt  log_transaction_amt
count   590540.000000        590540.000000
mean       135.027176             4.382960
std        239.162522             0.937183
min          0.251000             0.223943
25%         43.321000             3.791459
50%         68.769000             4.245190
75%        125.000000             4.836282
max      31937.391000            10.371564

Extreme Value Proportions:
  Low amount (â‰¤10th percentile): 10.08%
  High amount (â‰¥90th percentile): 10.00%

Fraud Rate by Amount Category:
  Low amount: 5.59%
  High amount: 5.10%



In [4]:
# Step 3C: Card Features
# Card combinations detect reused/compromised cards


print("=" * 60)
print("STEP 3C: Card Features")
print("=" * 60)

# Card combination (composite key)
train["card_combo"] = (
    train["card1"].astype(str) + "_" +
    train["card2"].astype(str) + "_" +
    train["card3"].astype(str)
)

# Card type flags
train["has_card2"] = train["card2"].notna().astype("int8")
train["has_card3"] = train["card3"].notna().astype("int8")
train["has_card5"] = train["card5"].notna().astype("int8")

print("âœ“ Created 4 card features")
print(f"\nCard Presence Rates:")
print(f"  card2: {train['has_card2'].mean():.2%}")
print(f"  card3: {train['has_card3'].mean():.2%}")
print(f"  card5: {train['has_card5'].mean():.2%}")
print()

STEP 3C: Card Features
âœ“ Created 4 card features

Card Presence Rates:
  card2: 98.49%
  card3: 99.73%
  card5: 99.28%



In [5]:
# Step 3D: Address Features
# Address matching detects shipping fraud


print("=" * 60)
print("STEP 3D: Address Features")
print("=" * 60)

# Address mismatch (billing â‰  shipping)
train["addr_mismatch"] = (
    (train["addr1"] != train["addr2"]) & 
    train["addr1"].notna() & 
    train["addr2"].notna()
).astype("int8")

print("âœ“ Created 1 address feature")
print(f"  Address mismatch rate: {train['addr_mismatch'].mean():.2%}")
print(f"  Fraud rate when mismatch: {train[train['addr_mismatch']==1]['isFraud'].mean():.2%}")
print()


STEP 3D: Address Features
âœ“ Created 1 address feature
  Address mismatch rate: 88.87%
  Fraud rate when mismatch: 2.46%



In [6]:
# Step 3E: Device Features
# Device type presence indicates device fingerprinting


print("=" * 60)
print("STEP 3E: Device Features")
print("=" * 60)

# Device presence flags
train["has_device_type"] = train["DeviceType"].notna().astype("int8")
train["has_device_info"] = train["DeviceInfo"].notna().astype("int8")

print("âœ“ Created 2 device features")
print(f"  DeviceType present: {train['has_device_type'].mean():.2%}")
print(f"  DeviceInfo present: {train['has_device_info'].mean():.2%}")
print()

STEP 3E: Device Features
âœ“ Created 2 device features
  DeviceType present: 23.84%
  DeviceInfo present: 20.09%



In [7]:
# Step 3F: Email & Domain Features
# Email presence/matching detects account fraud


print("=" * 60)
print("STEP 3F: Email & Domain Features")
print("=" * 60)

# Email presence
train["has_P_email"] = train["P_emaildomain"].notna().astype("int8")
train["has_R_email"] = train["R_emaildomain"].notna().astype("int8")

# Email domain matching
train["email_domain_match"] = (
    (train["P_emaildomain"] == train["R_emaildomain"]) &
    train["P_emaildomain"].notna() &
    train["R_emaildomain"].notna()
).astype("int8")

print("âœ“ Created 3 email features")
print(f"  P_email present: {train['has_P_email'].mean():.2%}")
print(f"  R_email present: {train['has_R_email'].mean():.2%}")
print(f"  Email domains match: {train['email_domain_match'].mean():.2%}")
print(f"\nFraud rate when emails match: {train[train['email_domain_match']==1]['isFraud'].mean():.2%}")
print()

STEP 3F: Email & Domain Features
âœ“ Created 3 email features
  P_email present: 84.01%
  R_email present: 23.25%
  Email domains match: 17.36%

Fraud rate when emails match: 9.65%



In [8]:
# Step 3G: Identity Feature Aggregations
# Identity completeness signals verification quality


print("=" * 60)
print("STEP 3G: Identity Features")
print("=" * 60)

identity_cols = [c for c in train.columns if c.startswith("id_")]

# Count non-null identity features
train["identity_feature_count"] = train[identity_cols].notna().sum(axis=1).astype("int8")

# Missing ratio
train["identity_missing_ratio"] = (
    train[identity_cols].isna().sum(axis=1) / len(identity_cols)
).astype("float32")

print(f"âœ“ Created 2 identity aggregation features")
print(f"\nIdentity Feature Statistics:")
print(train[["identity_feature_count", "identity_missing_ratio"]].describe())
print(f"\nFraud vs Non-Fraud Comparison:")
print(train.groupby("isFraud")[["identity_feature_count", "identity_missing_ratio"]].mean())
print()

STEP 3G: Identity Features
âœ“ Created 2 identity aggregation features

Identity Feature Statistics:
       identity_feature_count  identity_missing_ratio
count           590540.000000           590540.000000
mean                 5.767157                0.848233
std                 10.436908                0.274655
min                  0.000000                0.000000
25%                  0.000000                1.000000
50%                  0.000000                1.000000
75%                  0.000000                1.000000
max                 38.000000                1.000000

Fraud vs Non-Fraud Comparison:
         identity_feature_count  identity_missing_ratio
isFraud                                                
0                      5.510649                0.854983
1                     12.841553                0.662064



In [9]:
# Step 3H: C/D/M/V Column Aggregations
# Count non-null features in each group


print("=" * 60)
print("STEP 3H: C/D/M/V Aggregations")
print("=" * 60)

# C features (card-related metadata)
c_cols = [c for c in train.columns if c.startswith("C")]
if c_cols:
    train["C_feature_count"] = train[c_cols].notna().sum(axis=1).astype("int8")
    print(f"âœ“ Created C_feature_count from {len(c_cols)} columns")

# D features (timedelta features)
d_cols = [c for c in train.columns if c.startswith("D")]
if d_cols:
    train["D_feature_count"] = train[d_cols].notna().sum(axis=1).astype("int8")
    print(f"âœ“ Created D_feature_count from {len(d_cols)} columns")

# M features (match/mismatch features)
m_cols = [c for c in train.columns if c.startswith("M") and c not in ["M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9"]]
if m_cols:
    train["M_feature_count"] = train[m_cols].notna().sum(axis=1).astype("int8")
    print(f"âœ“ Created M_feature_count from {len(m_cols)} columns")

# V features (Vesta engineered features)
v_cols = [c for c in train.columns if c.startswith("V")]
if v_cols:
    train["V_feature_count"] = train[v_cols].notna().sum(axis=1).astype("int8")
    print(f"âœ“ Created V_feature_count from {len(v_cols)} columns")

print()

STEP 3H: C/D/M/V Aggregations
âœ“ Created C_feature_count from 14 columns
âœ“ Created D_feature_count from 17 columns
âœ“ Created V_feature_count from 339 columns



In [10]:
# Step 3I: Temporal Features
# Time-based patterns reveal fraud timing strategies

print("=" * 60)
print("STEP 3I: Temporal Features")
print("=" * 60)

# Hour of day (0-23)
train["TransactionHour"] = ((train["TransactionDT"] / 3600) % 24).astype("int8")

# Day of week (0=Monday, 6=Sunday)
train["TransactionDayOfWeek"] = ((train["TransactionDT"] / 86400) % 7).astype("int8")

# Time period flags
train["is_night_txn"] = train["TransactionHour"].between(0, 5).astype("int8")
train["is_morning_peak"] = train["TransactionHour"].between(6, 9).astype("int8")
train["is_evening"] = train["TransactionHour"].between(18, 23).astype("int8")

# Weekend flag
train["is_weekend"] = train["TransactionDayOfWeek"].isin([5, 6]).astype("int8")

print("âœ“ Created 6 temporal features")
print(f"\nTemporal Pattern Analysis:")
print(f"  Night transactions (0-5h): {train['is_night_txn'].mean():.2%}")
print(f"  Morning peak (6-9h): {train['is_morning_peak'].mean():.2%}")
print(f"  Evening (18-23h): {train['is_evening'].mean():.2%}")
print(f"  Weekend: {train['is_weekend'].mean():.2%}")

print(f"\nFraud Rates by Time:")
print(f"  Night: {train[train['is_night_txn']==1]['isFraud'].mean():.2%}")
print(f"  Morning: {train[train['is_morning_peak']==1]['isFraud'].mean():.2%}")
print(f"  Evening: {train[train['is_evening']==1]['isFraud'].mean():.2%}")
print(f"  Weekend: {train[train['is_weekend']==1]['isFraud'].mean():.2%}")
print()

STEP 3I: Temporal Features
âœ“ Created 6 temporal features

Temporal Pattern Analysis:
  Night transactions (0-5h): 24.16%
  Morning peak (6-9h): 2.50%
  Evening (18-23h): 42.04%
  Weekend: 28.82%

Fraud Rates by Time:
  Night: 3.83%
  Morning: 8.96%
  Evening: 3.46%
  Weekend: 3.38%



In [11]:
# Step 3J: Feature Summary


print("=" * 60)
print("STEP 3J: Feature Engineering Summary")
print("=" * 60)

new_features = [
    "log_transaction_amt", "is_low_amt", "is_high_amt",
    "card_combo", "has_card2", "has_card3", "has_card5",
    "addr_mismatch",
    "has_device_type", "has_device_info",
    "has_P_email", "has_R_email", "email_domain_match",
    "identity_feature_count", "identity_missing_ratio",
    "C_feature_count", "D_feature_count", "M_feature_count", "V_feature_count",
    "TransactionHour", "TransactionDayOfWeek",
    "is_night_txn", "is_morning_peak", "is_evening", "is_weekend"
]

# Count actually created features
created_features = [f for f in new_features if f in train.columns]

print(f"Total new features created: {len(created_features)}")
print(f"Final dataset shape: {train.shape}")
print(f"\nFeature Categories:")
print(f"  Transaction Amount: 3")
print(f"  Card: 4")
print(f"  Address: 1")
print(f"  Device: 2")
print(f"  Email: 3")
print(f"  Identity: 2")
print(f"  C/D/M/V Aggregations: 4")
print(f"  Temporal: 6")
print()

STEP 3J: Feature Engineering Summary
Total new features created: 24
Final dataset shape: (590540, 458)

Feature Categories:
  Transaction Amount: 3
  Card: 4
  Address: 1
  Device: 2
  Email: 3
  Identity: 2
  C/D/M/V Aggregations: 4
  Temporal: 6



In [12]:
# Step 3K: Data Quality Check

print("=" * 60)
print("STEP 3K: Data Quality Check")
print("=" * 60)

missing_counts = train[created_features].isna().sum()
if missing_counts.sum() > 0:
    print("Features with missing values:")
    print(missing_counts[missing_counts > 0])
else:
    print("âœ“ No missing values in new features")

print()

STEP 3K: Data Quality Check
âœ“ No missing values in new features



In [13]:
# Step 3L: Save Feature-Enhanced Dataset

output_path = "../data/processed/train_features_v1.parquet"
train.to_parquet(output_path, index=False)

print("=" * 60)
print("STEP 3L: Dataset Saved")
print("=" * 60)
print(f"âœ“ Saved to: {output_path}")
print(f"âœ“ Shape: {train.shape}")
print(f"âœ“ New features: {len(created_features)}")
print("=" * 60)
print("\nðŸŽ¯ Step 3 completed successfully!")
print("   Next: Step 4 - Data Preparation & Entity Behavioral Risk Encoding")
print("=" * 60)

STEP 3L: Dataset Saved
âœ“ Saved to: ../data/processed/train_features_v1.parquet
âœ“ Shape: (590540, 458)
âœ“ New features: 24

ðŸŽ¯ Step 3 completed successfully!
   Next: Step 4 - Data Preparation & Entity Behavioral Risk Encoding
