# Stage 6: Data Preparation — Credit Risk Modeling (Q1 2022)

This notebook prepares the Fannie Mae Q1 2022 loan-level dataset for modeling.

It produces TWO artifacts:
- **fannie_q1_2022_raw_clean.csv**: missing values imputed, categorical flags mapped robustly, numeric values kept on their raw (unscaled) scale, and a proxy label computed from those raw values.
- **fannie_q1_2022_model_ready.csv**: one-hot encoded and scaled features (dates and loan IDs removed), ready for modeling; includes the proxy label.

Key steps:
1) Load raw labeled Q1 dataset
2) Replace blanks → NaN; inspect dtypes, missingness, cardinality
3) Impute numeric & categorical; robust mappings (no NaNs from mapping)
4) Create proxy label on RAW values (NOT on scaled)
5) One-hot encode categoricals; scale numeric
6) Save both artifacts with sanity checks


In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Display options: never truncate columns, and render floats with two decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda x: f'{x:.2f}'

# Candidate locations for the raw labeled CSV, in priority order:
# the mounted Kaggle input dataset first, then the working directory.
RAW_CANDIDATE_PATHS = [
    "/kaggle/input/fannie-q1-2022-labeled/fannie_q1_2022_labeled.csv",
    "/kaggle/working/fannie_q1_2022_labeled.csv",
]

def first_existing_path(paths):
    """Return the first entry of ``paths`` that exists on disk.

    Entries are checked in order with ``os.path.exists``.

    Raises:
        FileNotFoundError: if none of the entries exists (including when
            ``paths`` is empty).
    """
    found = next((candidate for candidate in paths if os.path.exists(candidate)), None)
    if found is None:
        raise FileNotFoundError("Raw labeled CSV not found in expected locations.")
    return found

# Resolve the input file first: this raises FileNotFoundError if no candidate exists.
RAW_DATA_PATH = first_existing_path(RAW_CANDIDATE_PATHS)

# Both artifacts are written here; create the directory if it is not there yet.
OUTPUT_DIR = "/kaggle/working"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the resolved locations as the cell output.
(RAW_DATA_PATH, OUTPUT_DIR)


('/kaggle/input/fannie-q1-2022-labeled/fannie_q1_2022_labeled.csv',
 '/kaggle/working')

#### Load Dataset

In [2]:
# Load the raw labeled extract and show a small preview.
df = pd.read_csv(RAW_DATA_PATH)
print("Initial shape:", df.shape)
display(df.head(3))

# Replace blank strings with NaN for a fair missingness audit.
# read_csv already parses completely empty fields as NaN, so the cells that
# still need converting are the ones holding only whitespace (e.g. "  ");
# the anchored regex catches both those and exact empty strings.
df = df.replace(r'^\s*$', np.nan, regex=True)

print("\nInfo:")
df.info()

print("\nMissing values (top):")
display(df.isna().sum().sort_values(ascending=False).head(15))

print("\nCardinality (top):")
display(df.nunique().sort_values(ascending=False).head(15))


Initial shape: (11196, 22)


Unnamed: 0,LOAN_ID,ORIGINATION_DATE,FIRST_PAYMENT_DATE,ORIG_INTEREST_RATE,ORIG_UPB,ORIG_LOAN_TERM,ORIG_LTV,ORIG_CLTV,DTI,BORR_CREDIT_SCORE,CO_BORR_CREDIT_SCORE,NUM_BORROWERS,FIRST_TIME_HB_FLAG,LOAN_PURPOSE,PROPERTY_TYPE,NUM_UNITS,OCCUPANCY_STATUS,PROPERTY_STATE,MI_PCT,AMORTIZATION_TYPE,CHANNEL,default_12m
0,130357804,2022-01,2022-03,3.38,255000.0,360,72,72,43,703.0,,1,N,P,SF,1,P,NV,,N,R,0
1,130357811,2022-01,2022-03,3.25,184000.0,360,80,80,35,792.0,,1,N,R,SF,1,P,MN,,N,R,0
2,130357813,2022-01,2022-03,3.0,447000.0,360,53,53,43,691.0,,1,N,C,PU,1,P,CA,,N,R,0



Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11196 entries, 0 to 11195
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   LOAN_ID               11196 non-null  int64  
 1   ORIGINATION_DATE      11196 non-null  object 
 2   FIRST_PAYMENT_DATE    11196 non-null  object 
 3   ORIG_INTEREST_RATE    11196 non-null  float64
 4   ORIG_UPB              11196 non-null  float64
 5   ORIG_LOAN_TERM        11196 non-null  int64  
 6   ORIG_LTV              11196 non-null  int64  
 7   ORIG_CLTV             11196 non-null  int64  
 8   DTI                   11196 non-null  int64  
 9   BORR_CREDIT_SCORE     11181 non-null  float64
 10  CO_BORR_CREDIT_SCORE  4779 non-null   float64
 11  NUM_BORROWERS         11196 non-null  int64  
 12  FIRST_TIME_HB_FLAG    11196 non-null  object 
 13  LOAN_PURPOSE          11196 non-null  object 
 14  PROPERTY_TYPE         11196 non-null  object 
 15  NUM_UNITS   

MI_PCT                  8727
CO_BORR_CREDIT_SCORE    6417
BORR_CREDIT_SCORE         15
LOAN_ID                    0
FIRST_TIME_HB_FLAG         0
CHANNEL                    0
AMORTIZATION_TYPE          0
PROPERTY_STATE             0
OCCUPANCY_STATUS           0
NUM_UNITS                  0
PROPERTY_TYPE              0
LOAN_PURPOSE               0
NUM_BORROWERS              0
ORIGINATION_DATE           0
DTI                        0
dtype: int64


Cardinality (top):


LOAN_ID                 11196
ORIG_UPB                  746
ORIG_INTEREST_RATE        313
BORR_CREDIT_SCORE         220
CO_BORR_CREDIT_SCORE      218
ORIG_CLTV                  95
ORIG_LTV                   91
DTI                        52
PROPERTY_STATE             52
ORIG_LOAN_TERM             35
MI_PCT                      8
PROPERTY_TYPE               5
NUM_UNITS                   4
NUM_BORROWERS               4
LOAN_PURPOSE                3
dtype: int64

#### Basic Cleaning Decisions

In [3]:
# Drop sparse/unhelpful columns.
# MI_PCT is ~78% missing in this sample, so it is removed rather than imputed.
cols_to_drop = [name for name in ['MI_PCT'] if name in df.columns]

# drop() returns a new frame, so the frame produced by the load cell is not mutated.
df = df.drop(columns=cols_to_drop, errors='ignore')

# Co-borrower presence flag: must be derived BEFORE imputation fills the gaps,
# otherwise the information "no co-borrower score was reported" is lost.
if 'CO_BORR_CREDIT_SCORE' in df.columns:
    df['CO_BORR_PRESENT'] = df['CO_BORR_CREDIT_SCORE'].notna().astype(int)
else:
    df['CO_BORR_PRESENT'] = 0  # fallback if column is absent

# Dates: kept for now (the raw artifact retains them); the list is used later
# to drop them from the modeling frame.
date_cols = [
    name
    for name in ['ORIGINATION_DATE', 'FIRST_PAYMENT_DATE']
    if name in df.columns
]


#### Numerical & Categorical Imputation

In [4]:
# Identify core numeric columns (only those actually present in the frame)
numeric_cols = [c for c in [
    'ORIG_INTEREST_RATE','ORIG_UPB','ORIG_LOAN_TERM','ORIG_LTV','ORIG_CLTV',
    'DTI','BORR_CREDIT_SCORE','CO_BORR_CREDIT_SCORE','NUM_BORROWERS','NUM_UNITS'
] if c in df.columns]

has_borr_score = 'BORR_CREDIT_SCORE' in df.columns

# Impute borrower credit score (median)
if has_borr_score:
    borr_med = df['BORR_CREDIT_SCORE'].median()
    df['BORR_CREDIT_SCORE'] = df['BORR_CREDIT_SCORE'].fillna(borr_med)

# Impute co-borrower: use borrower's score first, then borrower median as backup.
# Only possible when the borrower column exists; previously this branch raised
# NameError/KeyError in that case. Without it, the generic median imputation
# below handles the co-borrower column on its own.
if 'CO_BORR_CREDIT_SCORE' in df.columns and has_borr_score:
    df['CO_BORR_CREDIT_SCORE'] = df['CO_BORR_CREDIT_SCORE'].fillna(df['BORR_CREDIT_SCORE'])
    df['CO_BORR_CREDIT_SCORE'] = df['CO_BORR_CREDIT_SCORE'].fillna(borr_med)

# Impute common numeric fields by median
for col in numeric_cols:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].median())

# Robust categorical mappings (no NaNs from mapping)
# FIRST_TIME_HB_FLAG: map to 0/1; default to 0 when unknown
# AMORTIZATION_TYPE: F (Fixed)=1, A (ARM)=0; default to 1 if unknown
flag_maps = {
    'FIRST_TIME_HB_FLAG': ({'Y': 1, 'N': 0}, 0),
    'AMORTIZATION_TYPE': ({'F': 1, 'A': 0}, 1),
}
for col, (codes, default) in flag_maps.items():
    if col not in df.columns:
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        # Already encoded (e.g. this cell was re-run). Mapping the 0/1 values
        # through the letter codes again would reset every row to the default.
        df[col] = df[col].fillna(default).astype(int)
        continue
    mapped = df[col].map(codes)
    n_unmapped = int(mapped.isna().sum())
    if n_unmapped:
        # Surface the fallback instead of applying it silently: a large count
        # means the code list above does not match the data.
        print(f"Note: {n_unmapped} value(s) in {col} were not in {sorted(codes)}; defaulted to {default}")
    df[col] = mapped.fillna(default).astype(int)

# Fill remaining categorical NaNs with mode (defensive)
cat_cols = [c for c in [
    'LOAN_PURPOSE','PROPERTY_TYPE','OCCUPANCY_STATUS','PROPERTY_STATE',
    'CHANNEL'
] if c in df.columns]

for c in cat_cols:
    if df[c].isna().any():
        df[c] = df[c].fillna(df[c].mode().iloc[0])

# Final missingness check (pre-proxy, pre-encoding)
df.isna().sum().sort_values(ascending=False).head(10)


LOAN_ID              0
ORIGINATION_DATE     0
default_12m          0
CHANNEL              0
AMORTIZATION_TYPE    0
PROPERTY_STATE       0
OCCUPANCY_STATUS     0
NUM_UNITS            0
PROPERTY_TYPE        0
LOAN_PURPOSE         0
dtype: int64

#### Proxy Label on RAW Values

In [5]:
# Build the proxy label from fixed thresholds applied to RAW (unscaled) values.
required = ['BORR_CREDIT_SCORE', 'DTI', 'ORIG_LTV']
missing_req = list(filter(lambda c: c not in df.columns, required))
if len(missing_req) > 0:
    raise ValueError(f"Missing required columns for proxy label: {missing_req}")

# A loan is flagged high-risk when ANY of the three conditions holds.
low_credit_score = df['BORR_CREDIT_SCORE'].lt(660)
high_dti = df['DTI'].gt(45)
high_ltv = df['ORIG_LTV'].gt(90)
df['high_risk_proxy'] = (low_credit_score | high_dti | high_ltv).astype(int)

# Report the class balance and make sure both classes are present.
proxy = df['high_risk_proxy']
print("Proxy counts:")
print(proxy.value_counts())
print("Proxy rate:", proxy.mean())
assert proxy.nunique() == 2, "Proxy is single-class; check thresholds."


Proxy counts:
high_risk_proxy
0    7920
1    3276
Name: count, dtype: int64
Proxy rate: 0.29260450160771706


#### Save RAW

In [6]:
# Snapshot the cleaned, unscaled frame (all current columns, in their current
# order) so that later cells cannot alter what gets written.
raw_cols_order = list(df.columns)
df_raw_clean = df.loc[:, raw_cols_order].copy()

# Write the RAW-CLEAN artifact without the pandas index.
raw_out = os.path.join(OUTPUT_DIR, "fannie_q1_2022_raw_clean.csv")
df_raw_clean.to_csv(path_or_buf=raw_out, index=False)
print("Saved RAW-CLEAN:", raw_out)


Saved RAW-CLEAN: /kaggle/working/fannie_q1_2022_raw_clean.csv


#### Build Model-Ready Set: Encode + Scale (no dates, no IDs)

In [7]:
# Drop date strings from modeling set
df_model = df.drop(columns=date_cols, errors='ignore').copy()

# Drop ID columns from features
id_cols = [c for c in ['LOAN_ID'] if c in df_model.columns]
# Target name
TARGET = 'high_risk_proxy'

# Observed outcome label(s). They stay in the output frame at their original
# position so the column layout is unchanged, but they must NOT be scaled:
# standardising a 0/1 outcome turns it into two arbitrary floats and makes it
# look like an ordinary numeric feature. Downstream code should exclude these
# columns from the feature matrix.
label_passthrough_cols = [c for c in ['default_12m'] if c in df_model.columns]

# Split feature groups
y = df_model[TARGET].astype(int)
X = df_model.drop(columns=[TARGET] + id_cols, errors='ignore')

# Identify categorical columns for one-hot
ohe_cols = [c for c in [
    'LOAN_PURPOSE','PROPERTY_TYPE','OCCUPANCY_STATUS','PROPERTY_STATE','CHANNEL',
    # engineered buckets could be added if you created them upstream
] if c in X.columns]

# Everything that is neither one-hot encoded nor an outcome label gets scaled
num_cols = [
    c for c in X.columns
    if c not in ohe_cols and c not in label_passthrough_cols
]

# One-hot encode categoricals, drop first to avoid multicollinearity
X_ohe = pd.get_dummies(X, columns=ohe_cols, drop_first=True)

# Scale numeric columns only
# NOTE(review): the scaler is fit on every row; if a train/test split happens
# downstream, the test rows have already influenced the mean/std used here.
scaler = StandardScaler()
num_cols_in_ohe = [c for c in num_cols if c in X_ohe.columns]

X_ohe[num_cols_in_ohe] = scaler.fit_transform(X_ohe[num_cols_in_ohe])

# Reassemble final modeling frame
# NOTE(review): the proxy target is computed from thresholds on
# BORR_CREDIT_SCORE, DTI and ORIG_LTV, and those columns remain among the
# features, so a model can recover the target exactly - confirm this is intended.
df_model_ready = pd.concat([X_ohe, y], axis=1)

# Final sanity checks
assert not df_model_ready.isna().any().any(), "NaNs remain in model-ready output"
assert TARGET in df_model_ready.columns, "Target missing from model-ready"
for c in label_passthrough_cols:
    assert df_model_ready[c].equals(df_model[c]), f"Outcome label {c} was altered"
print("Model-ready shape:", df_model_ready.shape)


Model-ready shape: (11196, 76)


#### Save Model-Ready Artifact

In [8]:
# Write the encoded + scaled frame to the output directory, without the
# pandas index.
model_out = os.path.join(OUTPUT_DIR, "fannie_q1_2022_model_ready.csv")
df_model_ready.to_csv(path_or_buf=model_out, index=False)
print("Saved MODEL-READY:", model_out)


Saved MODEL-READY: /kaggle/working/fannie_q1_2022_model_ready.csv
