In [13]:
# Imports and configuration
%load_ext autoreload
%autoreload 2

import src.utils.file_utils as fu
import src.helpers.feature_engineering_helper as feh
import pandas as pd
import numpy as np

# Load the two project configuration objects through the shared file-utils helper.
# NOTE(review): neither `cfg_schema` nor `cfg_feature_engineering` is referenced
# by the cells visible in this part of the notebook — confirm they are used
# elsewhere (or by later cells) before keeping them.
cfg_schema = fu.load_config("schema")
cfg_feature_engineering = fu.load_config("feature_engineering")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Read in interim typed data file
# `fu.load_parquet` resolves the dataset name to a parquet path and logs it
# (the recorded output shows data/interim/hmda_2024_typed.parquet).
typed_hmda_data = fu.load_parquet("hmda_2024_typed")
# Sanity check on size; the recorded run printed (8841112, 89).
print(typed_hmda_data.shape)

Loading dataset from /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/interim/hmda_2024_typed.parquet
(8841112, 89)


In [15]:
# Add a 0/1 indicator marking rows whose `income` is missing.
# `isna()` always yields a fully-populated boolean mask, so the cast result can
# never contain nulls and no sentinel fill is needed afterwards.
# Order matters: this must run before the income-imputation cell, because the
# flag is derived from the current contents of `income`; re-running it after
# the missing values have been filled would mark every row as 0.
typed_hmda_data["income_missing"] = typed_hmda_data["income"].isna().astype("int8[pyarrow]")

In [16]:
# Impute values for missing incomes stratified by loan_type using median income to loan amount ratio
# The helper's result is bound back to the same name, so the pre-imputation
# frame is no longer reachable after this cell runs.
# NOTE(review): the stratification/median logic lives in
# `feh.impute_income` and is not visible here — confirm it matches the
# description above. The `income_missing` flag has to exist before this call,
# since missingness presumably cannot be recovered afterwards.
typed_hmda_data = feh.impute_income(typed_hmda_data)

In [17]:
# Add a 0/1 indicator marking rows whose `property_value` is missing.
# `isna()` always yields a fully-populated boolean mask, so the cast result can
# never contain nulls and no sentinel fill is needed afterwards.
# Order matters: this must run before the property-value imputation cell,
# because the flag is derived from the current contents of `property_value`;
# re-running it after the missing values have been filled would mark every
# row as 0.
typed_hmda_data["property_value_missing"] = typed_hmda_data["property_value"].isna().astype("int8[pyarrow]")

In [18]:
# Impute values for missing property_values using median loan to property value
# The helper's result is bound back to the same name, so the pre-imputation
# frame is no longer reachable after this cell runs.
# NOTE(review): the imputation rule lives in `feh.impute_property_value` and
# is not visible here — confirm it matches the description above. The
# `property_value_missing` flag has to exist before this call, since
# missingness presumably cannot be recovered afterwards.
typed_hmda_data = feh.impute_property_value(typed_hmda_data)

In [19]:
# Derive loan_amount / income as a feature.
# Rows whose income is not strictly positive get NaN instead of a ratio, so a
# zero or negative income never produces an infinite or sign-flipped value.
typed_hmda_data["loan_to_income_ratio"] = np.where(
    typed_hmda_data["income"].gt(0),
    typed_hmda_data["loan_amount"].div(typed_hmda_data["income"]),
    np.nan,
)

In [20]:
# Drop columns that should not be passed on to modeling.
# `drop` is left strict (no errors="ignore"): an unknown name raises KeyError,
# which surfaces typos in the list instead of silently keeping a column.
columns_to_drop = [
    # per-slot ethnicity fields (applicant and co-applicant)
    "applicant_ethnicity_1", "applicant_ethnicity_2", "applicant_ethnicity_3",
    "applicant_ethnicity_4", "applicant_ethnicity_5",
    "co_applicant_ethnicity_1", "co_applicant_ethnicity_2", "co_applicant_ethnicity_3",
    "co_applicant_ethnicity_4", "co_applicant_ethnicity_5",
    # per-slot race fields (applicant and co-applicant)
    "applicant_race_1", "applicant_race_2", "applicant_race_3",
    "applicant_race_4", "applicant_race_5",
    "co_applicant_race_1", "co_applicant_race_2", "co_applicant_race_3",
    "co_applicant_race_4", "co_applicant_race_5",
    # outcome, geography and identifier fields
    # NOTE(review): presumably `action_taken` is the source of the target and is
    # removed to avoid leakage — confirm against the upstream notebook.
    "action_taken", "census_tract", "county_code", "activity_year", "lei",
    # dropped for extremely low correlation with target
    "state_code", "multifamily_affordable_units", "multifamily_affordable_units_exempt",
    "applicant_age", "applicant_age_above_62", "balloon_payment", "total_units",
]
# Rebind instead of using `inplace=True`: the result is the same frame without
# the listed columns, but the data flow is explicit and the call stays chainable.
typed_hmda_data = typed_hmda_data.drop(columns=columns_to_drop)

In [21]:
# Median-impute the genuinely continuous features (column-wise medians).
# NOTE(review): the medians are computed over the whole frame, i.e. before the
# train/test split indices are created further down — confirm that is
# acceptable for the evaluation protocol.
num_cols = [
    "combined_loan_to_value_ratio",
    "loan_term",
    "intro_rate_period",
    "prepayment_penalty_term",
    "loan_to_income_ratio",
]
column_medians = typed_hmda_data[num_cols].median()
typed_hmda_data[num_cols] = typed_hmda_data[num_cols].fillna(column_medians)

In [22]:
# Output data set to use in modeling
# Persists the engineered frame under the dataset name
# "hmda_2024_model_catboost"; the recorded output shows it landing in
# data/processed/. The call is the cell's last expression, so its return value
# (a path object in the recorded run) is echoed as the cell output.
fu.save_parquet(typed_hmda_data, "hmda_2024_model_catboost")

Saved to /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model_catboost.parquet


PosixPath('/Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model_catboost.parquet')

In [23]:
# Create indexes for train/test split
# The helper writes train and test index files tagged with the "_catboost"
# suffix; a later cell reads them back via the names "train_index_catboost" and
# "test_index_catboost".
# NOTE(review): no random seed or stratification argument is passed here —
# confirm the helper fixes both internally so the split is reproducible.
# NOTE(review): the recorded log lines end in
# ".../train_index_catboost.csv/train_index.csv" (and the test equivalent),
# which looks like a file name nested under another file name — confirm the
# helper's message matches the path that is actually written.
feh.create_train_test_splits(typed_hmda_data, index_suffix="_catboost")

Train indices saved to: /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/train_index_catboost.csv/train_index.csv
Test indices saved to:  /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/test_index_catboost.csv/test_index.csv


In [24]:
# Reload the persisted modeling frame together with the saved split indices,
# so the check below runs against what is on disk rather than in-memory state.
modeling_dataset = fu.load_parquet("hmda_2024_model_catboost")
train_output_path, test_output_path = (
    fu.get_path("train_index_catboost"),
    fu.get_path("test_index_catboost"),
)
# Each index file carries the row labels in a column named "index".
train_idx, test_idx = (
    pd.read_csv(csv_path)["index"]
    for csv_path in (train_output_path, test_output_path)
)

# Label-based selection of the train and test partitions
train_df, test_df = modeling_dataset.loc[train_idx], modeling_dataset.loc[test_idx]

# Report the target's class balance for each partition and for the full frame,
# to verify that the split kept the class proportions aligned.
target = "denied_flag"
class_balance_reports = (
    ("Train class proportions:", train_df),
    ("\nTest class proportions:", test_df),
    ("\nOverall dataset proportions:", modeling_dataset),
)
for heading, frame in class_balance_reports:
    print(heading)
    print(frame[target].value_counts(normalize=True).round(4))

Loading dataset from /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model_catboost.parquet
Train class proportions:
denied_flag
0    0.757
1    0.243
Name: proportion, dtype: double[pyarrow]

Test class proportions:
denied_flag
0    0.7571
1    0.2429
Name: proportion, dtype: double[pyarrow]

Overall dataset proportions:
denied_flag
0    0.7571
1    0.2429
Name: proportion, dtype: double[pyarrow]
