# Amex Pipeline: Staged Execution
This notebook allows you to run the Amex pipeline in stages: data loading, cleaning, feature engineering, EDA, feature selection, model training, validation, and submission generation.

In [1]:
import os
import pandas as pd
import numpy as np
from data.data_loader import load_all_data
from data.data_cleaning import clean_all_data_advanced
from data.advanced_feature_engineering import create_full_feature_set_advanced
from eda.exploratory_analysis import (
    plot_target_distribution, plot_missing_values, plot_feature_distributions, 
    plot_correlation_heatmap, plot_new_feature_analysis
)
from utils.metrics import map7_from_dataframe
from utils.submission import generate_submission
import warnings
warnings.filterwarnings('ignore')

def validate_pipeline_data(data, stage_name):
    """Print a data-quality report for one pipeline stage.

    The report lists the frame's shape, the total number of NaN cells, the
    number of +/-inf cells found in numeric columns, and the deep memory
    footprint in MB.

    Args:
        data: pandas DataFrame to inspect.
        stage_name: Label shown in the report header.

    Returns:
        True when the frame contains neither NaN nor infinite values,
        False otherwise.
    """
    print(f"\n=== {stage_name} Validation ===")
    print(f"Shape: {data.shape}")

    # Missing cells across every column, regardless of dtype
    nan_count = data.isna().sum().sum()
    print(f"NaN values: {nan_count}")

    # Infinity is only meaningful for numeric columns
    numeric_names = data.select_dtypes(include=[np.number]).columns
    inf_count = np.isinf(data[numeric_names]).sum().sum()
    print(f"Infinite values: {inf_count}")

    # deep=True also counts the payload of object (string) columns
    total_bytes = data.memory_usage(deep=True).sum()
    print(f"Memory usage: {total_bytes / (1024 * 1024):.1f} MB")

    has_problems = nan_count > 0 or inf_count > 0
    if not has_problems:
        print("✅ Data validation passed")
        return True

    print("⚠️  Data quality issues detected!")
    return False

In [2]:
# Stage 1: read the raw tables and report on the training split
print("=== STAGE 1: DATA LOADING ===")
try:
    data = load_all_data()
    print("✅ Data loaded successfully")
    for split_label, split_key in (("Train", "train"), ("Test", "test")):
        print(f"{split_label} shape: {data[split_key].shape}")

    # Report only: the returned flag is not acted on here because this
    # stage does no cleaning of its own.
    validate_pipeline_data(data["train"], "Raw Train Data")

except Exception as load_error:
    # Surface the failure in the cell output, then stop the notebook run.
    print(f"❌ Data loading failed: {load_error}")
    raise

=== STAGE 1: DATA LOADING ===
✅ Data loaded successfully
Train shape: (770164, 372)
Test shape: (369301, 371)

=== Raw Train Data Validation ===
Shape: (770164, 372)
NaN values: 68296406
Infinite values: 0
Memory usage: 1084.8 MB
⚠️  Data quality issues detected!


In [3]:
# Stage 2: Advanced cleaning with comprehensive error handling
print("\n=== STAGE 2: ADVANCED DATA CLEANING ===")
try:
    cleaned_data = clean_all_data_advanced(data)
    print("✅ Advanced cleaning completed")
    print(f"Cleaned train shape: {cleaned_data['train'].shape}")

    # Validate cleaned data
    is_valid = validate_pipeline_data(cleaned_data['train'], "Cleaned Train Data")

    if not is_valid:
        print("⚠️  Performing emergency data cleanup...")
        # Emergency cleanup: zero out NaN and +/-inf in numeric columns.
        # Numeric columns are resolved separately for each split. Train and
        # test do not share the same column set, and indexing the test frame
        # with a train-only label raises KeyError, which the handler below
        # would answer by throwing away the advanced cleaning result.
        numeric_cols = cleaned_data['train'].select_dtypes(include=[np.number]).columns
        cleaned_data['train'][numeric_cols] = (
            cleaned_data['train'][numeric_cols].fillna(0).replace([np.inf, -np.inf], 0)
        )

        test_numeric_cols = cleaned_data['test'].select_dtypes(include=[np.number]).columns
        cleaned_data['test'][test_numeric_cols] = (
            cleaned_data['test'][test_numeric_cols].fillna(0).replace([np.inf, -np.inf], 0)
        )

        print("✅ Emergency cleanup completed")
        validate_pipeline_data(cleaned_data['train'], "Emergency Cleaned Data")

except Exception as e:
    # Deliberate best-effort: keep the notebook usable even when the advanced
    # cleaning module fails, at the cost of a much cruder imputation.
    print(f"❌ Data cleaning failed: {e}")
    print("Falling back to basic cleaning...")

    # Emergency fallback: blanket zero-fill of every column in the raw frames
    cleaned_data = {
        'train': data['train'].fillna(0),
        'test': data['test'].fillna(0)
    }
    print("✅ Basic cleaning completed as fallback")



=== STAGE 2: ADVANCED DATA CLEANING ===
Starting robust advanced data cleaning pipeline...
Starting robust advanced data cleaning...
Removed 0 duplicate rows
Creating customer behavioral features...


KeyboardInterrupt: 

In [None]:
# Stage 3: Advanced feature engineering with robust error handling
print("\n=== STAGE 3: ADVANCED FEATURE ENGINEERING ===")
try:
    # Validate input data before feature engineering. Both splits are checked
    # because both are passed through the same feature-engineering call below.
    print("Validating input data...")
    input_valid = validate_pipeline_data(cleaned_data['train'], "Pre-Feature Engineering")
    test_input_valid = validate_pipeline_data(cleaned_data['test'], "Pre-Feature Engineering (Test)")

    if not (input_valid and test_input_valid):
        print("⚠️  Input data has issues, performing pre-processing cleanup...")
        for split_name in ('train', 'test'):
            split_frame = cleaned_data[split_name]
            # Resolve numeric columns per split: the two frames do not share
            # an identical column set, so train's labels cannot index test.
            split_numeric_cols = split_frame.select_dtypes(include=[np.number]).columns
            split_frame[split_numeric_cols] = (
                split_frame[split_numeric_cols].fillna(0).replace([np.inf, -np.inf], 0)
            )

    print("Starting feature engineering...")
    train_engineered, selected_features = create_full_feature_set_advanced(cleaned_data['train'])
    # NOTE(review): the test split is engineered by an independent call and its
    # own feature selection is discarded; confirm that the helper yields the
    # same columns for test as for train.
    test_engineered, _ = create_full_feature_set_advanced(cleaned_data['test'])

    print("✅ Feature engineering completed")
    print(f"Final train shape: {train_engineered.shape}")
    print(f"Selected features: {len(selected_features)}")

    validate_pipeline_data(train_engineered, "Engineered Train Data")

    print("\n=== Feature Engineering Quality Report ===")
    numeric_cols = train_engineered.select_dtypes(include=[np.number]).columns
    # Count cells still holding the -999 value this report tracks
    missing_indicator_count = (train_engineered[numeric_cols] == -999).sum().sum()
    print(f"Remaining -999 values: {missing_indicator_count}")
    print(f"Data types: {train_engineered.dtypes.value_counts().to_dict()}")

    # len() rather than truthiness: an array-like selection (numpy array,
    # pandas Index) raises on bool() and would needlessly trigger the fallback.
    if len(selected_features) > 0:
        print("\nTop 10 selected features:")
        for i, feature in enumerate(selected_features[:10]):
            print(f"{i+1:2d}. {feature}")

except Exception as e:
    # Deliberate best-effort: continue with the cleaned frames when the
    # advanced feature module fails.
    print(f"❌ Feature engineering failed: {e}")
    print("Using original cleaned data without advanced features...")

    train_engineered = cleaned_data['train'].copy()
    test_engineered = cleaned_data['test'].copy()
    # str() guards against non-string column labels
    selected_features = [col for col in train_engineered.columns if str(col).startswith('f')][:50]
    print(f"✅ Using {len(selected_features)} basic features as fallback")


AttributeError: 'dict' object has no attribute 'isna'

In [None]:
# Stage 4: Final validation and summary
print("\n=== STAGE 4: FINAL VALIDATION ===")

def final_pipeline_validation(train_data, test_data, features):
    """Run the pre-training checks on the engineered train and test frames.

    Checks performed, each failure being recorded as an issue:
      * the target column 'y' exists in ``train_data`` and has >= 2 classes;
      * at least 80% of ``features`` are present as columns of ``train_data``;
      * neither frame contains NaN values;
      * neither frame contains +/-inf in its numeric columns.

    Args:
        train_data: Training DataFrame, expected to contain the target 'y'.
        test_data: Test DataFrame; it may lack columns that exist only in
            the training frame (such as the target).
        features: Iterable of candidate feature column names.

    Returns:
        Tuple ``(passed, available_features)`` where ``passed`` is True when
        no issue was recorded and ``available_features`` is the list of
        requested features found in ``train_data``.
    """
    def _count_infinite(frame):
        """Count +/-inf cells in the numeric columns of ``frame``."""
        # Each frame is inspected against its own numeric columns. Reusing
        # the train frame's column list on the test frame raises KeyError
        # when a numeric column (e.g. the target) exists only in train.
        numeric = frame.select_dtypes(include=[np.number])
        if numeric.shape[1] == 0:
            return 0
        return int(np.isinf(numeric).sum().sum())

    print("Performing comprehensive pipeline validation...")

    issues = []

    print(f"Train data shape: {train_data.shape}")
    print(f"Test data shape: {test_data.shape}")

    # Target checks
    if 'y' in train_data.columns:
        target_dist = train_data['y'].value_counts()
        print(f"Target distribution: {target_dist.to_dict()}")
        if len(target_dist) < 2:
            issues.append("Target variable has insufficient classes")
    else:
        issues.append("Target variable 'y' not found")

    # Feature availability (measured against the training frame)
    available_features = [f for f in features if f in train_data.columns]
    print(f"Available features: {len(available_features)}/{len(features)}")

    if len(available_features) < len(features) * 0.8:
        issues.append(f"Too many features missing: {len(features) - len(available_features)}")

    # Missing values
    train_nan = train_data.isna().sum().sum()
    test_nan = test_data.isna().sum().sum()

    if train_nan > 0 or test_nan > 0:
        issues.append(f"NaN values found - Train: {train_nan}, Test: {test_nan}")

    # Infinite values
    train_inf = _count_infinite(train_data)
    test_inf = _count_infinite(test_data)

    if train_inf > 0 or test_inf > 0:
        issues.append(f"Infinite values found - Train: {train_inf}, Test: {test_inf}")

    if issues:
        print("\n⚠️  VALIDATION ISSUES DETECTED:")
        for i, issue in enumerate(issues, 1):
            print(f"{i}. {issue}")
        return False, available_features
    else:
        print("\n✅ ALL VALIDATION CHECKS PASSED")
        return True, available_features

validation_passed, final_features = final_pipeline_validation(
    train_engineered, test_engineered, selected_features
)

if not validation_passed:
    print("\n🔧 APPLYING FINAL FIXES...")
    # Zero out NaN and +/-inf in numeric columns. Columns are resolved per
    # frame: the training frame carries columns (such as the target) that the
    # test frame lacks, and indexing test with train's labels raises KeyError.
    numeric_cols = train_engineered.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        train_engineered[numeric_cols] = (
            train_engineered[numeric_cols].fillna(0).replace([np.inf, -np.inf], 0)
        )

    test_numeric_cols = test_engineered.select_dtypes(include=[np.number]).columns
    if len(test_numeric_cols) > 0:
        test_engineered[test_numeric_cols] = (
            test_engineered[test_numeric_cols].fillna(0).replace([np.inf, -np.inf], 0)
        )

    # Re-run the checks so the summary below reflects the repaired frames
    validation_passed, final_features = final_pipeline_validation(
        train_engineered, test_engineered, selected_features
    )

print("\n=== PIPELINE SUMMARY ===")
print(f"Status: {'✅ READY FOR MODEL TRAINING' if validation_passed else '❌ ISSUES REMAIN'}")
print(f"Final train shape: {train_engineered.shape}")
print(f"Final test shape: {test_engineered.shape}")
print(f"Features for modeling: {len(final_features)}")
print(f"Memory usage: {(train_engineered.memory_usage(deep=True).sum() / 1024 / 1024):.1f} MB")


Performing advanced feature engineering...
Starting advanced feature engineering...
Creating interaction features...
Created 4 interaction features
Creating temporal features...
Created 4 temporal features
Creating aggregated features...
Created 12 aggregated features
Performing feature selection...


ValueError: Input contains NaN

In [None]:
# Stage 5: Model Training (if validation passed)
print("\n=== STAGE 5: MODEL TRAINING ===")

if validation_passed and len(final_features) > 0:
    try:
        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import roc_auc_score, classification_report

        # Guard against target leakage: the label must never be a predictor,
        # even if an upstream selection step listed it.
        model_features = [col for col in final_features if col != 'y']

        X = train_engineered[model_features].copy()
        y = train_engineered['y']

        # Encode every non-numeric predictor; 'category' columns holding
        # strings need the same treatment as plain object columns.
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            print(f"Encoding {len(categorical_cols)} categorical features...")
            from sklearn.preprocessing import LabelEncoder
            for col in categorical_cols:
                # NOTE(review): encoders are fitted on train only and not
                # kept; the test split needs the same mapping before scoring.
                le = LabelEncoder()
                X[col] = le.fit_transform(X[col].astype(str))

        # Stratified hold-out keeps the class ratio equal in both parts
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print(f"Training set: {X_train.shape}")
        print(f"Validation set: {X_val.shape}")

        print("Training Random Forest model...")
        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42,
            n_jobs=-1
        )

        model.fit(X_train, y_train)

        # Probability of the second class; this assumes a binary target
        val_pred = model.predict_proba(X_val)[:, 1]
        auc_score = roc_auc_score(y_val, val_pred)

        print("\n✅ Model training completed!")
        print(f"Validation AUC: {auc_score:.4f}")

        # Label importances with the columns the model was actually fitted on
        # so the two arrays always have matching length and order.
        feature_importance = pd.DataFrame({
            'feature': list(X.columns),
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)

        print("\nTop 10 most important features:")
        print(feature_importance.head(10))

    except Exception as e:
        # Best-effort: report the failure without aborting the notebook
        print(f"❌ Model training failed: {e}")
        print("Pipeline completed data preparation successfully, but model training needs debugging.")

else:
    print("⚠️  Skipping model training due to validation issues.")
    print("Focus on fixing data quality issues first.")

print("\n🎉 PIPELINE EXECUTION COMPLETED!")
