## 02 – Feature Engineering & Preprocessing

### 1. Objective
In this notebook we:
- Load the merged, labeled training dataset (`merged_train.csv`)
- Create a train/validation split
- Build a leakage-safe preprocessing pipeline
- Produce model-ready feature matrices for downstream modeling

In [20]:
import os

import joblib
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

### 2. Load Merged Training Data
This dataset was created in `01_eda.ipynb` by merging:
- `train_transaction.csv`
- `train_identity.csv`

In [21]:
# Location of the merged training table, relative to this notebook.
DATA_PATH = "../data/processed/merged_train.csv"

# Read the whole CSV into memory; `df` is the frame the following cells start from.
df = pd.read_csv(DATA_PATH)
print(f"Loaded merged_train shape: {df.shape}")

# Keep the preview as the cell's last expression so it renders as a rich table.
df.head()

Loaded merged_train shape: (590540, 435)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,TransactionHour
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,...,70787.0,,,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,,,,,,,,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,0


### 3. Basic Checks
Confirm target column exists and inspect missingness at a high level.

In [22]:
# Fail fast if the label column is absent from the loaded frame.
assert "isFraud" in df.columns, "Target column isFraud not found!"

# Share of positive labels, expressed as a percentage.
fraud_rate_pct = round(df["isFraud"].mean() * 100, 4)
print("Fraud rate (%):", fraud_rate_pct)

# Fraction of missing values per column, ten worst columns first.
missing_share = df.isnull().mean().sort_values(ascending=False).head(10)
print("Missing values (%):")
print(round(missing_share * 100, 2))

Fraud rate (%): 3.499
Missing values (%):
id_24    99.20
id_25    99.13
id_08    99.13
id_07    99.13
id_21    99.13
id_26    99.13
id_23    99.12
id_22    99.12
id_27    99.12
dist2    93.63
dtype: float64


### 4. Split into Train/Validation (Leakage-Safe)
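We split using stratification to preserve the fraud ratio in both sets. The split happens before any preprocessing is fitted, so imputation values and one-hot categories are learned from the training rows only.

> Note (to confirm): this is a random split. `TransactionDT` appears to be a time offset, so a time-ordered split may give a more realistic validation estimate.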

In [23]:
TARGET = "isFraud"
ID_COL = "TransactionID"

# The label is cast to plain ints; the features are every other column.
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

# 80/20 split with a fixed seed, stratified on the label so both parts
# keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Val shape:", X_val.shape)
for split_name, split_target in (("Train", y_train), ("Val", y_val)):
    print(f"{split_name} fraud %:", round(split_target.mean() * 100, 4))

Train shape: (472432, 434) Val shape: (118108, 434)
Train fraud %: 3.4989
Val fraud %: 3.4993


### 5. Define Feature Types
We separate numeric vs categorical columns to apply appropriate preprocessing steps.

In [24]:
# The identifier column is a row key rather than a predictive feature; drop it when present.
drop_cols = [ID_COL] if ID_COL in X_train.columns else []

X_train_, X_val_ = (
    frame.drop(columns=drop_cols, errors="ignore") for frame in (X_train, X_val)
)

# Route columns by dtype, using the training split as the reference:
# numeric dtypes on one side, object (string) columns on the other.
numeric_features = list(X_train_.select_dtypes(include=[np.number]).columns)
categorical_features = list(X_train_.select_dtypes(include=["object"]).columns)

print("Num features:", len(numeric_features))
print("Cat features:", len(categorical_features))

Num features: 402
Cat features: 31


### 6. Optional: Reduce Extremely High-Cardinality Categoricals
Some categorical columns may have too many unique values. We keep only those with reasonable cardinality
to avoid massive one-hot expansions.

In [25]:
MAX_CARDINALITY = 50  # adjust if needed

# Distinct non-null values per categorical column, measured on the training split only.
cardinality = X_train_[categorical_features].nunique(dropna=True)

# Columns at or below the threshold are kept for one-hot encoding; the rest are set aside.
low_card_cats = [col for col in categorical_features if cardinality[col] <= MAX_CARDINALITY]
high_card_cats = [col for col in categorical_features if cardinality[col] > MAX_CARDINALITY]

print("Low-cardinality categorical:", len(low_card_cats))
print("High-cardinality categorical (dropped for now):", len(high_card_cats))

Low-cardinality categorical: 25
High-cardinality categorical (dropped for now): 6


### 7. Build Preprocessing Pipeline
- Numeric: median imputation
- Categorical (low-card): most frequent imputation + one-hot encoding

We fit on training data only, then transform validation data.

In [26]:
# Numeric columns: fill gaps with the per-column median.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Low-cardinality categoricals: fill gaps with the most frequent value, then
# one-hot encode. handle_unknown="ignore" keeps transform from failing on
# categories that were not seen during fit.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
])

# (name, transformer, columns) routing table for the ColumnTransformer.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, low_card_cats),
]

# remainder="drop" discards every column not routed above,
# which includes the high-cardinality categoricals.
preprocessor = ColumnTransformer(transformers=column_routes, remainder="drop")

preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


### 8. Fit on Train, Transform Train/Validation
This produces model-ready matrices `X_train_prep` and `X_val_prep`.

In [27]:
# Fit on the training split only, then reuse the fitted preprocessor on the
# validation split so no validation statistics enter the transformers.
X_train_prep = preprocessor.fit_transform(X_train_)
X_val_prep = preprocessor.transform(X_val_)

for split_label, matrix in (("X_train", X_train_prep), ("X_val", X_val_prep)):
    print(f"Prepared {split_label}:", matrix.shape)

Prepared X_train: (472432, 464)
Prepared X_val: (118108, 464)


### 9. Extract Feature Names (Optional but Useful)
This helps with debugging and model interpretability.

In [28]:
# Read the output names back through the *fitted* sub-pipelines instead of
# reusing the input column lists. SimpleImputer (keep_empty_features=False)
# drops any column that was entirely missing in the training split, so the
# input lists can be longer than the transformed matrix; going through
# get_feature_names_out keeps names and matrix columns aligned.
num_pipeline = preprocessor.named_transformers_["num"]
num_names = num_pipeline.get_feature_names_out(numeric_features).tolist()

cat_pipeline = preprocessor.named_transformers_["cat"]
cat_names = cat_pipeline.get_feature_names_out(low_card_cats).tolist()

# Fitted encoder kept under its original name for any later inspection.
ohe = cat_pipeline.named_steps["onehot"]

# Same order as the ColumnTransformer output: numeric block first, then one-hot block.
feature_names = num_names + cat_names

# Guard against silent misalignment between names and matrix columns.
assert len(feature_names) == X_train_prep.shape[1], (
    f"Feature name count ({len(feature_names)}) does not match "
    f"matrix width ({X_train_prep.shape[1]})"
)

print("Total feature names:", len(feature_names))
print("First 20:", feature_names[:20])

Total feature names: 464
First 20: ['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']


### 10. Save Outputs for Modeling
We save:
- The fitted preprocessor (for consistent transformations later)
- Feature matrices (sparse `.npz` or dense `.npy`, whichever the preprocessor produced) + targets for fast model training

In [30]:
OUT_DIR = "../data/processed"
os.makedirs(OUT_DIR, exist_ok=True)

# Save fitted preprocessor so later notebooks apply the exact same transformations.
joblib.dump(preprocessor, os.path.join(OUT_DIR, "preprocessor.joblib"))

# Save X matrices in the format that matches what the preprocessor returned.
# There is deliberately no blanket try/except here: a failed save should stop
# the notebook instead of being masked, and calling np.save on a sparse matrix
# would write a pickled object array rather than a usable dense matrix.
if sparse.issparse(X_train_prep):
    sparse.save_npz(os.path.join(OUT_DIR, "X_train_prep.npz"), X_train_prep)
    sparse.save_npz(os.path.join(OUT_DIR, "X_val_prep.npz"), X_val_prep)
    print("Saved X as sparse .npz")
else:
    np.save(os.path.join(OUT_DIR, "X_train_prep.npy"), X_train_prep)
    np.save(os.path.join(OUT_DIR, "X_val_prep.npy"), X_val_prep)
    print("Saved X as dense .npy")

# Save targets as plain arrays (row order matches the X matrices above).
np.save(os.path.join(OUT_DIR, "y_train.npy"), np.asarray(y_train))
np.save(os.path.join(OUT_DIR, "y_val.npy"), np.asarray(y_val))

# Save feature names only if the optional feature-name cell was run.
if "feature_names" in globals():
    pd.Series(feature_names).to_csv(
        os.path.join(OUT_DIR, "feature_names.csv"),
        index=False,
        header=["feature"]
    )

print("Saved artifacts to:", OUT_DIR)

Saved X as dense .npy
Saved artifacts to: ../data/processed
