# Amex Pipeline: Staged Execution
This notebook allows you to run the Amex pipeline in stages: data loading, cleaning, feature engineering, EDA, feature selection, model training, validation, and submission generation.

In [1]:
# Updated amex_pipeline.ipynb cell
import os
import pandas as pd
from data.data_loader import load_all_data
from data.data_cleaning import clean_all_data_advanced
from data.advanced_feature_engineering import create_full_feature_set_advanced
from eda.exploratory_analysis import (
    plot_target_distribution, plot_missing_values, plot_feature_distributions, 
    plot_correlation_heatmap, plot_new_feature_analysis
)
from models.model_training import split_data, train_logistic_regression
from utils.metrics import map7_from_dataframe
from utils.submission import generate_submission

In [2]:
# Stage 1 - data loading.
# `load_all_data()` returns a mapping of datasets; the 'train' entry is read
# below to report the dimensions of the training frame.
print("Loading data...")
data = load_all_data()
raw_train_shape = data['train'].shape
print(f"Train shape: {raw_train_shape}")

Loading data...
Train shape: (770164, 372)


In [None]:
# Stage 2 - advanced cleaning (clustering-based imputation, per the pipeline's
# own description). Takes the mapping produced by the loading stage and returns
# a cleaned mapping whose 'train' entry is inspected below.
print("Performing advanced cleaning...")
cleaned_data = clean_all_data_advanced(data)
cleaned_train_shape = cleaned_data['train'].shape
print(f"Cleaned train shape: {cleaned_train_shape}")

Performing advanced cleaning...
Starting advanced data cleaning...
Creating customer behavioral features...
Creating customer segments...
Performing advanced segment-based imputation...
Phase 1: Offer-based imputation...
Phase 2: Segment-based imputation...
Phase 3: KNN imputation for remaining features...


In [None]:
# Stage 3 - advanced feature engineering.
# Builds the engineered train/test frames from the cleaned data, then prints
# sanity checks for remaining missing values and -999 sentinel values.
print("Performing advanced feature engineering...")
train_engineered, selected_features = create_full_feature_set_advanced(cleaned_data['train'])
# Use a named discard rather than `_`: in IPython `_` holds the last cell
# output, and assigning to it shadows that behaviour for the rest of the session.
test_engineered, _test_selected_features = create_full_feature_set_advanced(cleaned_data['test'])

print(f"Final train shape: {train_engineered.shape}")
print(f"Selected features: {len(selected_features)}")

# Verify no missing values remain
print("\nMissing values check:")
print("Train missing values:", train_engineered.isnull().sum().sum())
print("Test missing values:", test_engineered.isnull().sum().sum())

# Check for -999 values (should be minimal after advanced imputation).
# NOTE(review): the sample rows displayed in this notebook also contain
# -9999.0 values, which this check does not count - confirm whether that is a
# second sentinel that should be handled as well.
print("\n-999 values check:")
# "number" selects all numeric dtypes without requiring a numpy import
# (numpy is not imported in this notebook's import cell).
numeric_cols = train_engineered.select_dtypes(include="number").columns
# Resolve the test frame's numeric columns separately: the train frame may
# carry columns (e.g. the target) that the test frame lacks, and indexing the
# test frame with train-only column names would raise a KeyError.
test_numeric_cols = test_engineered.select_dtypes(include="number").columns
print("Train -999 values:", (train_engineered[numeric_cols] == -999).sum().sum())
print("Test -999 values:", (test_engineered[test_numeric_cols] == -999).sum().sum())

Feature engineering step omitted. Using cleaned data.


Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f357,f358,f359,f360,f361,f362,f363,f364,f365,f366
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0.0,1.0,-999.0,-999.0,-999.0,...,-999.0,-9999.0,0.0,-999.0,28.0,0.0,0.0,337.0,0.0,0.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0.0,1.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,0.0,-999.0,87.0,0.0,0.0,1010.0,2.0,0.00198
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0.0,1.0,-999.0,-999.0,-999.0,...,-999.0,-999.0,0.0,-999.0,23.0,0.0,0.0,1010.0,2.0,0.00198
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0.0,1.0,-999.0,-999.0,-999.0,...,-999.0,-9999.0,0.0,-999.0,277.0,1.0,0.00361,337.0,0.0,0.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0.0,1.0,-999.0,-999.0,-999.0,...,-999.0,-9999.0,0.0,-999.0,359.0,0.0,0.0,337.0,0.0,0.0


Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,f357,f358,f359,f360,f361,f362,f363,f364,f365,f366
0,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,-999.0,-999.0,-999.0,-999.0,-999.0,...,0.002781,0.0466,0.0,-999.0,1.0,0.0,0.0,56.0,0.0,0.0
1,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373000,2023-11-04,-999.0,9.0,-999.0,-999.0,-999.0,...,0.001429,0.060309,0.0,-999.0,195.0,13.0,0.066667,-999.0,-999.0,-999.0
2,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,-999.0,-999.0,-999.0,-999.0,22.0,...,-0.017496,0.073484,0.0,-999.0,155.0,67.0,0.432258,1142.0,436.0,0.381786
3,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244000,2023-11-04,-999.0,-999.0,-999.0,-999.0,-999.0,...,0.001316,0.040572,0.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657000,2023-11-05,-999.0,-999.0,-999.0,-999.0,-999.0,...,0.002054,0.038244,0.0,-999.0,29.0,2.0,0.068966,361.0,3.0,0.00831
