In [13]:
# Imports and configuration
%load_ext autoreload
%autoreload 2

import src.utils.file_utils as fu
import src.helpers.feature_engineering_helper as feh

# Load the project configs by name. cfg_feature_engineering is handed to the
# multi-hot feature helper further down.
# NOTE(review): cfg_schema is not referenced in the cells visible here —
# confirm it is still needed.
cfg_schema = fu.load_config("schema")
cfg_feature_engineering = fu.load_config("feature_engineering")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Read in interim typed data file
# (the recorded output shows it resolving to data/interim/hmda_2024_typed.parquet).
typed_hmda_data = fu.load_parquet("hmda_2024_typed")
# Sanity-check the load; the recorded run printed (8841112, 88).
print(typed_hmda_data.shape)

Loading dataset from /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/interim/hmda_2024_typed.parquet
(8841112, 88)


In [15]:
# Build the multi-hot race/ethnicity indicator columns, one column-name prefix
# at a time, for both the applicant and the co-applicant.
# The helper's return value is not used here, so it presumably adds the new
# columns to typed_hmda_data in place — TODO confirm against the helper.
for column_prefix in (
    "applicant_ethnicity_",
    "co_applicant_ethnicity_",
    "applicant_race_",
    "co_applicant_race_",
):
    feh.generate_multi_hot_features(typed_hmda_data, cfg_feature_engineering, column_prefix)
print("Converted multi-hot columns for race/ethnicity")

Converted multi-hot columns for race/ethnicity


In [16]:
# Collapse multifamily_affordable_units into a yes/no indicator, since well
# over 99% of the values are NA/0. Missing values count as 0, so the flag is
# True only where the unit count is strictly positive.
has_affordable_units = typed_hmda_data["multifamily_affordable_units"].fillna(0) > 0
typed_hmda_data["multifamily_affordable_units"] = has_affordable_units.astype("boolean[pyarrow]")

In [17]:
# Add binary flag for missing income values.
# This runs before the income imputation cell that follows, so the flag
# records which rows had an NA income at this point in the pipeline.
typed_hmda_data["income_missing"] = typed_hmda_data["income"].isna().astype("boolean[pyarrow]")

In [18]:
# Impute values for missing incomes stratified by loan_type using median income to loan amount ratio
# The frame returned by the helper replaces typed_hmda_data.
# NOTE(review): the imputation logic lives in feh.impute_income and is not
# visible here — confirm the description above against the helper.
typed_hmda_data = feh.impute_income(typed_hmda_data)

In [19]:
# Add binary flag for missing property_value values.
# This runs before the property_value imputation cell that follows, so the
# flag records which rows had an NA property_value at this point.
typed_hmda_data["property_value_missing"] = typed_hmda_data["property_value"].isna().astype("boolean[pyarrow]")

In [20]:
# Impute values for missing property_values using median loan to property value
# The frame returned by the helper replaces typed_hmda_data.
# NOTE(review): the imputation logic lives in feh.impute_property_value and is
# not visible here — confirm the description above against the helper.
typed_hmda_data = feh.impute_property_value(typed_hmda_data)

In [21]:
# Drop unneeded columns: the numbered race/ethnicity source fields (their
# multi-hot replacements were generated earlier in the notebook) plus a few
# other fields that are not carried into the modeling data set.
applicant_ethnicity_cols = [
    "applicant_ethnicity_1",
    "applicant_ethnicity_2",
    "applicant_ethnicity_3",
    "applicant_ethnicity_4",
    "applicant_ethnicity_5",
]
co_applicant_ethnicity_cols = [
    "co_applicant_ethnicity_1",
    "co_applicant_ethnicity_2",
    "co_applicant_ethnicity_3",
    "co_applicant_ethnicity_4",
    "co_applicant_ethnicity_5",
]
applicant_race_cols = [
    "applicant_race_1",
    "applicant_race_2",
    "applicant_race_3",
    "applicant_race_4",
    "applicant_race_5",
]
co_applicant_race_cols = [
    "co_applicant_race_1",
    "co_applicant_race_2",
    "co_applicant_race_3",
    "co_applicant_race_4",
    "co_applicant_race_5",
]
other_unneeded_cols = ["action_taken", "census_tract", "county_code", "activity_year"]

# Concatenate in the same order as the groups are declared above.
columns_to_drop = [
    *applicant_ethnicity_cols,
    *co_applicant_ethnicity_cols,
    *applicant_race_cols,
    *co_applicant_race_cols,
    *other_unneeded_cols,
]
# The drop mutates typed_hmda_data in place.
typed_hmda_data.drop(columns=columns_to_drop, inplace=True)

In [22]:
# Output data set to use in modeling
# (the recorded run wrote data/processed/hmda_2024_model.parquet). The call is
# the cell's last expression, so the path it returns is echoed as cell output.
fu.save_parquet(typed_hmda_data, "hmda_2024_model")

Saved to /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet


PosixPath('/Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet')

In [23]:
# Inspect the column dtypes of the frame that was just written out
# (the recorded run lists 120 columns).
typed_hmda_data.dtypes

lei                                    string[pyarrow]
loan_type                               int16[pyarrow]
loan_purpose                            int16[pyarrow]
preapproval                              bool[pyarrow]
construction_method                      bool[pyarrow]
                                            ...       
co_applicant_race_info_not_provided      bool[pyarrow]
co_applicant_race_not_applicable         bool[pyarrow]
co_applicant_race_no_co_applicant        bool[pyarrow]
income_missing                           bool[pyarrow]
property_value_missing                   bool[pyarrow]
Length: 120, dtype: object