In [12]:
# Imports and configuration
%load_ext autoreload
%autoreload 2

import src.utils.file_utils as fu
import src.helpers.feature_engineering_helper as feh
import pandas as pd

# Load the "schema" and "feature_engineering" configurations, in that order,
# through the project's file utilities.
cfg_schema, cfg_feature_engineering = (
    fu.load_config(section) for section in ("schema", "feature_engineering")
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the interim typed HMDA dataset and report its dimensions
interim_dataset_name = "hmda_2024_typed"
typed_hmda_data = fu.load_parquet(interim_dataset_name)
print(typed_hmda_data.shape)

Loading dataset from /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/interim/hmda_2024_typed.parquet
(8841112, 88)


In [3]:
# Build multi-hot features for every race/ethnicity column family.
# The helper's return value is not used, so it presumably adds the new
# columns to typed_hmda_data in place -- TODO confirm in the helper.
multi_hot_prefixes = (
    "applicant_ethnicity_",
    "co_applicant_ethnicity_",
    "applicant_race_",
    "co_applicant_race_",
)
for column_prefix in multi_hot_prefixes:
    feh.generate_multi_hot_features(typed_hmda_data, cfg_feature_engineering, column_prefix)
print("Converted multi-hot columns for race/ethnicity")

Converted multi-hot columns for race/ethnicity


In [4]:
# Collapse multifamily_affordable_units into a "has any units" flag,
# since well over 99% of the values are NA/0.
# Missing values count as zero units, so they end up False rather than NA.
affordable_units_filled = typed_hmda_data["multifamily_affordable_units"].fillna(0)
typed_hmda_data["multifamily_affordable_units"] = (affordable_units_filled > 0).astype(
    "boolean[pyarrow]"
)

In [5]:
# Record which rows have no income value in a dedicated boolean column
income_is_null = typed_hmda_data["income"].isna()
typed_hmda_data["income_missing"] = income_is_null.astype("boolean[pyarrow]")

In [6]:
# Impute values for missing incomes stratified by loan_type using median income to loan amount ratio
# (description of the helper's strategy; its implementation is not visible here).
# Ordering matters: the income_missing flag is derived from income.isna() in the
# preceding cell, so re-running that cell after this one would presumably mark
# every row as not missing.
# NOTE(review): this runs before the train/test split is created; if the helper
# computes its medians over the full dataset, test rows influence the imputed
# values -- confirm whether that is acceptable.
typed_hmda_data = feh.impute_income(typed_hmda_data)

In [7]:
# Record which rows have no property_value in a dedicated boolean column
property_value_is_null = typed_hmda_data["property_value"].isna()
typed_hmda_data["property_value_missing"] = property_value_is_null.astype("boolean[pyarrow]")

In [8]:
# Impute values for missing property_values using median loan to property value
# (description of the helper's strategy; its implementation is not visible here).
# Ordering matters: the property_value_missing flag is derived from
# property_value.isna() in the preceding cell, so re-running that cell after
# this one would presumably mark every row as not missing.
# NOTE(review): this runs before the train/test split is created; if the helper
# computes its median over the full dataset, test rows influence the imputed
# values -- confirm whether that is acceptable.
typed_hmda_data = feh.impute_property_value(typed_hmda_data)

In [9]:
# Remove columns that are not needed for modeling: the numbered (1-5)
# race/ethnicity source columns plus a handful of other fields.
numbered_column_prefixes = (
    "applicant_ethnicity_",
    "co_applicant_ethnicity_",
    "applicant_race_",
    "co_applicant_race_",
)
numbered_columns = [
    f"{prefix}{slot}"
    for prefix in numbered_column_prefixes
    for slot in range(1, 6)
]
other_unneeded_columns = ["action_taken", "census_tract", "county_code", "activity_year"]
columns_to_drop = numbered_columns + other_unneeded_columns

# In-place drop: running this cell a second time raises KeyError because the
# columns are already gone.
typed_hmda_data.drop(columns=columns_to_drop, inplace=True)

In [10]:
# Output data set to use in modeling
# Saved under the name "hmda_2024_model"; the final cell reloads it by that name.
# The call is the cell's last expression, so its return value (the saved file's
# path, per the cell output) is echoed below the cell.
fu.save_parquet(typed_hmda_data, "hmda_2024_model")

Saved to /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet


PosixPath('/Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet')

In [11]:
# Create indexes for train/test split
# The helper writes the train and test index files to disk (see the logged
# paths in the cell output); the following cell reads them back via
# fu.get_path("train_index") / fu.get_path("test_index").
# NOTE(review): the logged paths end in "train_index.csv/train_index.csv" and
# "test_index.csv/test_index.csv", which suggests the helper appends the file
# name to a path that already contains it -- verify the intended location.
# NOTE(review): no seed or split ratio is passed here; whether the split is
# reproducible depends on the helper's defaults -- confirm.
feh.create_train_test_splits(typed_hmda_data)

Train indices saved to: /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/train_index.csv/train_index.csv
Test indices saved to:  /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/test_index.csv/test_index.csv


In [13]:
# Reload the saved modeling dataset together with the persisted split indices
modeling_dataset = fu.load_parquet("hmda_2024_model")
train_output_path, test_output_path = fu.get_path("train_index"), fu.get_path("test_index")
train_idx = pd.read_csv(train_output_path)["index"]
test_idx = pd.read_csv(test_output_path)["index"]

# Select the rows of each split by index label
train_df, test_df = modeling_dataset.loc[train_idx], modeling_dataset.loc[test_idx]

# Compare the target's class balance across train, test and the full dataset
target = "denied_flag"
report_sections = (
    ("Train class proportions:", train_df),
    ("\nTest class proportions:", test_df),
    ("\nOverall dataset proportions:", modeling_dataset),
)
for heading, frame in report_sections:
    print(heading)
    print(frame[target].value_counts(normalize=True).round(4))

Loading dataset from /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet
Train class proportions:
denied_flag
False    0.757
True     0.243
Name: proportion, dtype: double[pyarrow]

Test class proportions:
denied_flag
False    0.7571
True     0.2429
Name: proportion, dtype: double[pyarrow]

Overall dataset proportions:
denied_flag
False    0.7571
True     0.2429
Name: proportion, dtype: double[pyarrow]
