In [29]:
# PHASE 3 STEP 11: LOAD AND INSPECT DATA

# Import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load data
print("Loading data...")
df = pd.read_csv('../data/raw/creditcard.csv')
print(f"✓ Data loaded: {df.shape}")

# Quick inspection
print("\n" + "="*70)
print("DATA QUALITY CHECK")
print("="*70)
print(f"Total transactions: {len(df):,}")
print(f"Total features: {df.shape[1]}")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")

# Handle duplicates
print("\n" + "="*70)
print("REMOVING DUPLICATES")
print("="*70)
duplicates_count = df.duplicated().sum()
print(f"Duplicates found: {duplicates_count}")

if duplicates_count > 0:
    df = df.drop_duplicates()
    print(f"✓ Removed {duplicates_count:,} duplicate rows")
    print(f"✓ Clean dataset size: {df.shape}")
    print(f"✓ Duplicates remaining: {df.duplicated().sum()}")
else:
    print("✓ No duplicates found")

# Class distribution
print("\n" + "="*70)
print("CLASS DISTRIBUTION")
print("="*70)
print(df['Class'].value_counts())
print(f"\nFraud rate: {df['Class'].mean()*100:.3f}%")


# PRE-MODELING VALIDATION CHECKS

print("\n" + "="*70)
print("CHECK 1: CLASS IMBALANCE SEVERITY")
print("="*70)
fraud_count = df['Class'].sum()
legit_count = len(df) - fraud_count
imbalance_ratio = legit_count / fraud_count

print(f"Legitimate transactions: {legit_count:,}")
print(f"Fraud transactions: {fraud_count:,}")
print(f"Imbalance ratio: {imbalance_ratio:.1f}:1")

if imbalance_ratio > 100:
    print("  SEVERE imbalance - will need special handling in modeling")
else:
    print("✓ Moderate imbalance - standard techniques will work")


print("\n" + "="*70)
print("CHECK 2: FEATURE DISTRIBUTIONS")
print("="*70)
# Check Amount column specifically (our only non-PCA feature besides Time)
print(f"Amount column statistics:")
print(f"  Min: ${df['Amount'].min():.2f}")
print(f"  Max: ${df['Amount'].max():.2f}")
print(f"  Mean: ${df['Amount'].mean():.2f}")
print(f"  Median: ${df['Amount'].median():.2f}")
print(f"  Std Dev: ${df['Amount'].std():.2f}")

# Check for extreme outliers
q99 = df['Amount'].quantile(0.99)
extreme_outliers = (df['Amount'] > q99).sum()
print(f"\nTransactions above 99th percentile: {extreme_outliers:,}")
print("✓ We'll handle these with robust scaling")


print("\n" + "="*70)
print("CHECK 3: ZERO VARIANCE IN INPUT FEATURES")
print("="*70)
# Check if any INPUT FEATURES have zero or near-zero variance
# (We exclude Class because it's our TARGET, not a feature)
feature_cols = [col for col in df.columns if col != 'Class']
variances = df[feature_cols].var()
low_variance = variances[variances < 0.01]

if len(low_variance) == 0:
    print("✓ All input features have sufficient variance")
else:
    print(f"  Input features with low variance: {len(low_variance)}")
    print(low_variance)

print(f"\nNote: Class (target variable) has low variance by design")
print(f"      This is normal for imbalanced classification problems")

print("\n" + "="*70)
print("CHECK 4: FEATURE TYPES")
print("="*70)
print("INPUT FEATURES (30 total):")
print("  • V1-V28: Already PCA-transformed (DON'T scale again)")
print("  • Time: Needs scaling (seconds since first transaction)")
print("  • Amount: Needs scaling (transaction dollar amounts)")
print("\nTARGET VARIABLE (1 total):")
print("  • Class: What we're predicting (0=Legit, 1=Fraud)")
print("\n✓ We'll only scale Time and Amount in next step")


print("\n" + "="*70)
print("CHECK 5: FINAL DATA SHAPE")
print("="*70)
print(f"Total columns: {df.shape[1]}")
print(f"  ├── Input features: 30 (V1-V28, Time, Amount)")
print(f"  └── Target variable: 1 (Class)")
print(f"\nTotal transactions: {df.shape[0]:,}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n" + "="*70)
print("✓ DATA IS CLEAN AND READY FOR FEATURE SCALING!")
print("="*70)

Loading data...
✓ Data loaded: (284807, 31)

DATA QUALITY CHECK
Total transactions: 284,807
Total features: 31
Missing values: 0
Duplicate rows: 1081

REMOVING DUPLICATES
Duplicates found: 1081
✓ Removed 1,081 duplicate rows
✓ Clean dataset size: (283726, 31)
✓ Duplicates remaining: 0

CLASS DISTRIBUTION
Class
0    283253
1       473
Name: count, dtype: int64

Fraud rate: 0.167%

CHECK 1: CLASS IMBALANCE SEVERITY
Legitimate transactions: 283,253
Fraud transactions: 473
Imbalance ratio: 598.8:1
  SEVERE imbalance - will need special handling in modeling

CHECK 2: FEATURE DISTRIBUTIONS
Amount column statistics:
  Min: $0.00
  Max: $25691.16
  Mean: $88.47
  Median: $22.00
  Std Dev: $250.40

Transactions above 99th percentile: 2,838
✓ We'll handle these with robust scaling

CHECK 3: ZERO VARIANCE IN INPUT FEATURES
✓ All input features have sufficient variance

Note: Class (target variable) has low variance by design
      This is normal for imbalanced classification problems

CHECK 4: FE

In [30]:
# PHASE 3 PART 12: FEATURE SCALING

from sklearn.preprocessing import StandardScaler

print("="*70)
print("FEATURE SCALING")
print("="*70)

# Check current scales
print("\nBefore scaling:")
print(f"Time range: {df['Time'].min():.2f} to {df['Time'].max():.2f}")
print(f"Amount range: €{df['Amount'].min():.2f} to €{df['Amount'].max():.2f}")
print(f"V1 range (for comparison): {df['V1'].min():.2f} to {df['V1'].max():.2f}")

# Create a copy for scaling
df_scaled = df.copy()

# Scale Time and Amount
scaler = StandardScaler()
df_scaled['Time_scaled'] = scaler.fit_transform(df[['Time']])
df_scaled['Amount_scaled'] = scaler.fit_transform(df[['Amount']])

# Drop original Time and Amount
df_scaled = df_scaled.drop(['Time', 'Amount'], axis=1)

# Reorder columns (put scaled features after V28, before Class)
feature_cols = [f'V{i}' for i in range(1, 29)] + ['Time_scaled', 'Amount_scaled']
df_scaled = df_scaled[feature_cols + ['Class']]

print("\nAfter scaling:")
print(f"Time_scaled range: {df_scaled['Time_scaled'].min():.2f} to {df_scaled['Time_scaled'].max():.2f}")
print(f"Amount_scaled range: {df_scaled['Amount_scaled'].min():.2f} to {df_scaled['Amount_scaled'].max():.2f}")

print("\n✓ Features scaled successfully!")
print(f"   All features now on similar scale")
print(f"   Final feature count: {df_scaled.shape[1] - 1}")  # -1 for Class column

print("\nFirst few rows of scaled data:")
print(df_scaled.head())

FEATURE SCALING

Before scaling:
Time range: 0.00 to 172792.00
Amount range: €0.00 to €25691.16
V1 range (for comparison): -56.41 to 2.45

After scaling:
Time_scaled range: -2.00 to 1.64
Amount_scaled range: -0.35 to 102.25

✓ Features scaled successfully!
   All features now on similar scale
   Final feature count: 30

First few rows of scaled data:
         V1        V2        V3        V4        V5        V6        V7  \
0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9       V10  ...       V22       V23       V24       V25  \
0  0.098698  0.363787  0.090794  ...  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425 -0.166974  ...

In [31]:
# PHASE 3 PAER 13: FEATURE TARGET SPLIT

print("="*70)
print("SPLITTING FEATURES AND TARGET")
print("="*70)

# X = features (everything except Class)
# y = target (what we want to predict - Class)
X = df_scaled.drop('Class', axis=1)
y = df_scaled['Class']

print(f"\nFeatures (X) shape: {X.shape}")
print(f"  Total samples: {X.shape[0]:,}")
print(f"  Total features: {X.shape[1]}")

print(f"\nTarget (y) shape: {y.shape}")
print(f"  Total samples: {y.shape[0]:,}")
print(f"  Target name: {y.name}")

print(f"\nTarget distribution:")
print(f"  Legitimate (0): {(y == 0).sum():,} ({(y == 0).sum()/len(y)*100:.2f}%)")
print(f"  Fraud (1): {(y == 1).sum():,} ({(y == 1).sum()/len(y)*100:.3f}%)")

print(f"\nFeature names:")
print(X.columns.tolist())

print("\n✓ Data split complete!")

SPLITTING FEATURES AND TARGET

Features (X) shape: (283726, 30)
  Total samples: 283,726
  Total features: 30

Target (y) shape: (283726,)
  Total samples: 283,726
  Target name: Class

Target distribution:
  Legitimate (0): 283,253 (99.83%)
  Fraud (1): 473 (0.167%)

Feature names:
['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Time_scaled', 'Amount_scaled']

✓ Data split complete!


In [32]:
# PHASE 3 STEP 14 TRAIN-TEST SPLIT

from sklearn.model_selection import train_test_split

print("="*70)
print("TRAIN-TEST SPLIT")
print("="*70)

# Split: 80% for training, 20% for testing
# stratify=y ensures same fraud ratio in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      # 20% for testing
    random_state=42,    # Makes results repeatable
    stratify=y          # Keeps same fraud proportion in both sets
)

print(f"\nTraining set:")
print(f"  Total samples: {X_train.shape[0]:,}")
print(f"  Features: {X_train.shape[1]}")
print(f"  Fraud rate: {y_train.mean()*100:.3f}%")
print(f"  Frauds: {y_train.sum():,}")
print(f"  Legitimate: {(y_train == 0).sum():,}")

# NEW: Calculate and display training set ratio
train_legit = (y_train == 0).sum()
train_fraud = y_train.sum()
train_ratio = train_legit / train_fraud
print(f"  Ratio (Legitimate:Fraud): {train_ratio:.1f}:1")

print(f"\nTesting set:")
print(f"  Total samples: {X_test.shape[0]:,}")
print(f"  Features: {X_test.shape[1]}")
print(f"  Fraud rate: {y_test.mean()*100:.3f}%")
print(f"  Frauds: {y_test.sum():,}")
print(f"  Legitimate: {(y_test == 0).sum():,}")

# NEW: Calculate and display testing set ratio
test_legit = (y_test == 0).sum()
test_fraud = y_test.sum()
test_ratio = test_legit / test_fraud
print(f"  Ratio (Legitimate:Fraud): {test_ratio:.1f}:1")

print(f"\n✓ Split maintains fraud ratio in both sets!")
print(f"   Training: {len(X_train):,} samples ({len(X_train)/len(X)*100:.0f}%)")
print(f"   Testing: {len(X_test):,} samples ({len(X_test)/len(X)*100:.0f}%)")

# NEW: Summary comparison
print(f"\n{'='*70}")
print("IMBALANCE SUMMARY")
print(f"{'='*70}")
print(f"Training set ratio: {train_ratio:.1f}:1 ({train_legit:,} legit : {train_fraud:,} fraud)")
print(f"Testing set ratio:  {test_ratio:.1f}:1 ({test_legit:,} legit : {test_fraud:,} fraud)")
print(f"\nBoth sets maintain ~{train_ratio:.0f}:1 class imbalance")

TRAIN-TEST SPLIT

Training set:
  Total samples: 226,980
  Features: 30
  Fraud rate: 0.167%
  Frauds: 378
  Legitimate: 226,602
  Ratio (Legitimate:Fraud): 599.5:1

Testing set:
  Total samples: 56,746
  Features: 30
  Fraud rate: 0.167%
  Frauds: 95
  Legitimate: 56,651
  Ratio (Legitimate:Fraud): 596.3:1

✓ Split maintains fraud ratio in both sets!
   Training: 226,980 samples (80%)
   Testing: 56,746 samples (20%)

IMBALANCE SUMMARY
Training set ratio: 599.5:1 (226,602 legit : 378 fraud)
Testing set ratio:  596.3:1 (56,651 legit : 95 fraud)

Both sets maintain ~599:1 class imbalance


In [33]:
# PHASE 3 STEP 15 HANDLE EXTREME IMBALANCE WITH SMOTE AND ADASYN

from imblearn.over_sampling import SMOTE, ADASYN

print("="*70)
print("STEP 15: COMPARING SMOTE VS ADASYN")
print("="*70)

print("\n ORIGINAL IMBALANCE:")
print(f"   Training frauds: {y_train.sum():,}")
print(f"   Training legitimate: {(y_train==0).sum():,}")
print(f"   Imbalance ratio: {(y_train==0).sum() / y_train.sum():.0f}:1")

print("\n" + "="*70)
print("METHOD 1: SMOTE (Synthetic Minority Over-sampling Technique)")
print("="*70)

print("\n SMOTE APPROACH:")
print("   • Creates equal synthetic samples near ALL fraud examples")
print("   • Uniform distribution across fraud space")
print("   • Industry-standard for extreme imbalance")

print("\nBefore SMOTE:")
print(f"  Legitimate (0): {(y_train == 0).sum():,}")
print(f"  Fraud (1): {y_train.sum():,}")
print(f"  Ratio: {(y_train==0).sum() / y_train.sum():.0f}:1")

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE:")
print(f"  Legitimate (0): {(y_train_smote == 0).sum():,}")
print(f"  Fraud (1): {(y_train_smote == 1).sum():,}")
print(f"  Ratio: 1:1 (perfectly balanced!)")

print(f"\n✓ SMOTE complete!")
print(f"   Original training size: {X_train.shape[0]:,}")
print(f"   SMOTE training size: {X_train_smote.shape[0]:,}")
print(f"   Synthetic frauds created: {X_train_smote.shape[0] - X_train.shape[0]:,}")

print("\n" + "="*70)
print("METHOD 2: ADASYN (Adaptive Synthetic Sampling)")
print("="*70)

print("\n ADASYN APPROACH:")
print("   • Creates MORE synthetic samples near hard-to-classify frauds")
print("   • Adaptive distribution based on classification difficulty")
print("   • Focuses on challenging boundary cases")

print("\nBefore ADASYN:")
print(f"  Legitimate (0): {(y_train == 0).sum():,}")
print(f"  Fraud (1): {y_train.sum():,}")
print(f"  Ratio: {(y_train==0).sum() / y_train.sum():.0f}:1")

# Apply ADASYN
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

print("\nAfter ADASYN:")
print(f"  Legitimate (0): {(y_train_adasyn == 0).sum():,}")
print(f"  Fraud (1): {(y_train_adasyn == 1).sum():,}")
print(f"  Ratio: ~1:1 (adaptively balanced!)")

print(f"\n✓ ADASYN complete!")
print(f"   Original training size: {X_train.shape[0]:,}")
print(f"   ADASYN training size: {X_train_adasyn.shape[0]:,}")
print(f"   Synthetic frauds created: {X_train_adasyn.shape[0] - X_train.shape[0]:,}")

print("\n" + "="*70)
print("COMPARISON SUMMARY")
print("="*70)

print("\nDataset Sizes:")
print(f"  Original training:  {X_train.shape[0]:>8,} samples")
print(f"  SMOTE balanced:     {X_train_smote.shape[0]:>8,} samples")
print(f"  ADASYN balanced:    {X_train_adasyn.shape[0]:>8,} samples")

print("\nSynthetic Frauds Created:")
print(f"  SMOTE:  {X_train_smote.shape[0] - X_train.shape[0]:>8,} synthetic frauds")
print(f"  ADASYN: {X_train_adasyn.shape[0] - X_train.shape[0]:>8,} synthetic frauds")

print("\nFraud Distribution:")
smote_fraud_count = (y_train_smote == 1).sum()
adasyn_fraud_count = (y_train_adasyn == 1).sum()
print(f"  SMOTE fraud samples:  {smote_fraud_count:>8,}")
print(f"  ADASYN fraud samples: {adasyn_fraud_count:>8,}")

print("\n" + "="*70)
print("VERIFICATION CHECKS")
print("="*70)

print("\n SMOTE Training Data:")
print(f"  X_train_smote shape:  {X_train_smote.shape}")
print(f"  y_train_smote shape:  {y_train_smote.shape}")
print(f"  Fraud ratio: {(y_train_smote == 0).sum() / (y_train_smote == 1).sum():.2f}:1")

print("\n ADASYN Training Data:")
print(f"  X_train_adasyn shape: {X_train_adasyn.shape}")
print(f"  y_train_adasyn shape: {y_train_adasyn.shape}")
print(f"  Fraud ratio: {(y_train_adasyn == 0).sum() / (y_train_adasyn == 1).sum():.2f}:1")

print("\n Test Data (UNCHANGED for both methods):")
print(f"  X_test shape: {X_test.shape}")
print(f"  y_test frauds: {y_test.sum()}")
print(f"  y_test ratio: {(y_test == 0).sum() / y_test.sum():.0f}:1")

print("\n" + "="*70)
print("KEY DIFFERENCES")
print("="*70)

print("\nSMOTE:")
print("  ✓ Uniform synthetic generation")
print("  ✓ Predictable 1:1 balance")
print("  ✓ Industry standard approach")
print("  ✓ Simpler algorithm (faster)")

print("\nADASYN:")
print("  ✓ Adaptive synthetic generation")
print("  ✓ Focuses on hard-to-classify cases")
print("  ✓ May create slightly different total counts")
print("  ✓ Potentially better quality synthetics")

print("\n NEXT STEPS:")
print("   We will train models on BOTH datasets and compare:")
print("   • Fraud recall (primary metric)")
print("   • Precision")
print("   • Training vs test performance gap")
print("   • Overall model quality")

print("\n✓ Both balanced datasets ready for model training!")

print("\n" + "="*70)
print("USAGE NOTES")
print("="*70)

print("\n PURPOSE OF THESE BALANCED DATASETS:")
print("   These pre-balanced datasets (X_train_smote, X_train_adasyn)")
print("   will be used for FINAL model training AFTER cross-validation.")

print("\n CROSS-VALIDATION WORKFLOW:")
print("   1. Cross-validation will use ORIGINAL X_train (226,980 samples)")
print("      - Apply SMOTE/ADASYN fresh inside each CV fold")
print("      - Validate on REAL frauds (not synthetic)")
print("      - Get expected performance estimates")
print("   ")
print("   2. Final model training will use THESE pre-balanced datasets")
print("      - X_train_smote (453,204 samples) for SMOTE models")
print("      - X_train_adasyn (453,240 samples) for ADASYN models")
print("   ")
print("   3. Final testing will use X_test (56,746 samples, 95 REAL frauds)")
print("      - Test ONCE on best model")
print("      - Get honest real-world performance")

print("\n WHY CREATE THESE NOW?")
print("   - Shows SMOTE vs ADASYN comparison clearly")
print("   - Ready for final model training (saves time later)")
print("   - Educational: see exact synthetic fraud counts")

print("\n  IMPORTANT:")
print("   Test set (X_test, y_test) remains COMPLETELY UNTOUCHED")
print("   - NO SMOTE applied to test data")
print("   - Preserves real-world fraud rate (0.167%)")
print("   - Used ONLY for final honest evaluation")

STEP 15: COMPARING SMOTE VS ADASYN

 ORIGINAL IMBALANCE:
   Training frauds: 378
   Training legitimate: 226,602
   Imbalance ratio: 599:1

METHOD 1: SMOTE (Synthetic Minority Over-sampling Technique)

 SMOTE APPROACH:
   • Creates equal synthetic samples near ALL fraud examples
   • Uniform distribution across fraud space
   • Industry-standard for extreme imbalance

Before SMOTE:
  Legitimate (0): 226,602
  Fraud (1): 378
  Ratio: 599:1

After SMOTE:
  Legitimate (0): 226,602
  Fraud (1): 226,602
  Ratio: 1:1 (perfectly balanced!)

✓ SMOTE complete!
   Original training size: 226,980
   SMOTE training size: 453,204
   Synthetic frauds created: 226,224

METHOD 2: ADASYN (Adaptive Synthetic Sampling)

 ADASYN APPROACH:
   • Creates MORE synthetic samples near hard-to-classify frauds
   • Adaptive distribution based on classification difficulty
   • Focuses on challenging boundary cases

Before ADASYN:
  Legitimate (0): 226,602
  Fraud (1): 378
  Ratio: 599:1

After ADASYN:
  Legitimate