# 🧠 Feature Engineering

This notebook builds additional engineered variables and prepares train/test splits with scaling.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Default size (width, height in inches) applied to every figure created after this cell.
plt.rcParams.update({"figure.figsize": (12, 6)})

## Load the wrangled modeling table

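This notebook expects a DataFrame named `df_model`. If you are running the notebooks end-to-end in one session, it will already exist in memory.
Otherwise, reload it from a CSV/parquet file that you save at the end of `mrm_03`; the load statement in the cell below is commented out, so uncomment and adapt it first.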

In [None]:
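# Option A: `df_model` already exists in memory from the same session -- nothing to do here.
# Option B (recommended in a real project): load it from disk by uncommenting the line below.
#           NOTE: DATA_DIR is not defined in this notebook and must be set before uncommenting.
# df_model = pd.read_parquet(DATA_DIR + "df_model.parquet")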

## Engineer variables

In [None]:
# Build the engineered table as one chained expression. `assign` works on a copy,
# so `df_model` itself is left untouched, and each lambda can read the columns
# created by the keyword arguments before it.
df_fe = (
    df_model
    .assign(
        # Log of the original UPB to reduce skew
        ORIG_UPB_log=lambda frame: np.log(frame["ORIG_UPB"]),
        # Ratios / proxies
        DTI_OLTV=lambda frame: frame["DTI"] / frame["OLTV"],
        home_price_proxy=lambda frame: frame["ORIG_UPB"] / frame["OLTV"],
        home_price_log=lambda frame: np.log(frame["home_price_proxy"]),
        home_price_fico=lambda frame: frame["CSCORE_B"] / frame["home_price_log"],
    )
    # Division by zero and log(0) produce +/-inf; turn those into NaN first ...
    .replace([np.inf, -np.inf], np.nan)
    # ... then zero-fill every remaining NaN across the whole frame (all columns,
    # not only the engineered ones).
    .fillna(0)
)

# Transposed summary statistics: one row per column.
df_fe.describe().T

## Train/test split + scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the "bad" flag cast to int; every other column is a predictor.
y = df_fe["bad"].astype(int)
X = df_fe.drop(columns="bad")

# 70/30 hold-out split, stratified on the target, with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
    stratify=y,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions so no test-set statistics enter the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled.shape, X_test_scaled.shape

## Save artifacts (recommended)

Saving the scaler and the engineered dataset makes downstream notebooks reproducible.

In [None]:
import joblib

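# Uncomment the two lines below to persist the fitted scaler and the engineered table.
# NOTE: DATA_DIR is not defined in this notebook and must be set before uncommenting.
# joblib.dump(scaler, DATA_DIR + "scaler.joblib")
# df_fe.to_parquet(DATA_DIR + "df_fe.parquet", index=False)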