In [1]:
# Imports and configuration
%load_ext autoreload
%autoreload 2

import src.utils.file_utils as fu
import src.helpers.feature_engineering_helper as feh
import pandas as pd

cfg_schema = fu.load_config("schema")
cfg_feature_engineering = fu.load_config("feature_engineering")

In [2]:
# Read in interim typed data file
typed_hmda_data = fu.load_parquet("hmda_2024_typed")
print(typed_hmda_data.shape)

Loading dataset from /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/interim/hmda_2024_typed.parquet
(8841112, 89)


In [3]:
# Generate multi-hot features for race/ethnicity features
feh.generate_multi_hot_features(typed_hmda_data, cfg_feature_engineering, "applicant_ethnicity_")
feh.generate_multi_hot_features(typed_hmda_data, cfg_feature_engineering, "co_applicant_ethnicity_")
feh.generate_multi_hot_features(typed_hmda_data, cfg_feature_engineering, "applicant_race_")
feh.generate_multi_hot_features(typed_hmda_data, cfg_feature_engineering, "co_applicant_race_")
print("Converted multi-hot columns for race/ethnicity")

Converted multi-hot columns for race/ethnicity


In [4]:
# Convert multifamily_affordable_units NA to 0
typed_hmda_data["multifamily_affordable_units"] = typed_hmda_data["multifamily_affordable_units"].fillna(0)

In [5]:
# Add flag for missing income values
typed_hmda_data["income_missing"] = typed_hmda_data["income"].isna().astype("int8[pyarrow]").fillna(-1)

In [6]:
# Impute values for missing incomes stratified by loan_type using median income to loan amount ratio
typed_hmda_data = feh.impute_income(typed_hmda_data)

In [7]:
# Add flag for missing property_value values
typed_hmda_data["property_value_missing"] = typed_hmda_data["property_value"].isna().astype("int8[pyarrow]").fillna(-1)

In [8]:
# Impute values for missing property_values using median loan to property value
typed_hmda_data = feh.impute_property_value(typed_hmda_data)

In [9]:
# Drop unneeded columns
columns_to_drop = [
    "applicant_ethnicity_1", "applicant_ethnicity_2", "applicant_ethnicity_3", "applicant_ethnicity_4", "applicant_ethnicity_5",
    "co_applicant_ethnicity_1", "co_applicant_ethnicity_2", "co_applicant_ethnicity_3", "co_applicant_ethnicity_4", "co_applicant_ethnicity_5",
    "applicant_race_1", "applicant_race_2", "applicant_race_3", "applicant_race_4", "applicant_race_5",
    "co_applicant_race_1", "co_applicant_race_2", "co_applicant_race_3", "co_applicant_race_4", "co_applicant_race_5",
    "action_taken", "census_tract", "county_code", "activity_year", "lei",
    # dropped for extremely low correlation with target
    "state_code", "multifamily_affordable_units", "multifamily_affordable_units_exempt", "applicant_age", "applicant_age_above_62", "balloon_payment", "total_units"
]
typed_hmda_data.drop(columns=columns_to_drop, inplace=True)

In [11]:
# Fill string missing values with NA for one-hot encoding
typed_hmda_data[typed_hmda_data.select_dtypes(include=["string"]).columns] = typed_hmda_data.select_dtypes(include=["string"]).fillna("NA")

In [12]:
# For numerics that are categorical, fill missing values with -1
catg_num_cols = ["applicant_credit_scoring_model", "co_applicant_credit_scoring_model","manufactured_home_secured_property_type", "submission_of_application", "initially_payable_to_institution"]
typed_hmda_data[catg_num_cols] = typed_hmda_data[catg_num_cols].fillna(typed_hmda_data[catg_num_cols].median())

In [13]:
# For true numerics, fill missing values with median
num_cols = ["combined_loan_to_value_ratio", "loan_term", "intro_rate_period", "prepayment_penalty_term"]
typed_hmda_data[num_cols] = typed_hmda_data[num_cols].fillna(typed_hmda_data[num_cols].median())

In [15]:
# One-hot encoding for categorical features
typed_hmda_data = feh.one_hot_encode_columns(typed_hmda_data, cfg_feature_engineering)

In [16]:
# Output data set to use in modeling
fu.save_parquet(typed_hmda_data, "hmda_2024_model")

Saved to /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet


PosixPath('/Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet')

In [17]:
# Create indexes for train/test split
feh.create_train_test_splits(typed_hmda_data)

Train indices saved to: /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/train_index.csv/train_index.csv
Test indices saved to:  /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/test_index.csv/test_index.csv


In [18]:
# Load modeling dataset and the split indices
modeling_dataset = fu.load_parquet("hmda_2024_model")
train_output_path = fu.get_path("train_index")
test_output_path = fu.get_path("test_index")
train_idx = pd.read_csv(train_output_path)["index"]
test_idx  = pd.read_csv(test_output_path)["index"]

# Subset the DataFrame
train_df = modeling_dataset.loc[train_idx]
test_df  = modeling_dataset.loc[test_idx]

# Compute class proportions
target = "denied_flag"

print("Train class proportions:")
print(train_df[target].value_counts(normalize=True).round(4))

print("\nTest class proportions:")
print(test_df[target].value_counts(normalize=True).round(4))

print("\nOverall dataset proportions:")
print(modeling_dataset[target].value_counts(normalize=True).round(4))

Loading dataset from /Users/c1burns/Documents/UTD/BUAN 6341/project_repo/data/processed/hmda_2024_model.parquet
Train class proportions:
denied_flag
0    0.757
1    0.243
Name: proportion, dtype: double[pyarrow]

Test class proportions:
denied_flag
0    0.7571
1    0.2429
Name: proportion, dtype: double[pyarrow]

Overall dataset proportions:
denied_flag
0    0.7571
1    0.2429
Name: proportion, dtype: double[pyarrow]


In [19]:
test_df = typed_hmda_data.sample(frac=0.005, random_state=42)

In [20]:
import src.helpers.eda_helpers as eh
eh.identify_feature_target_correlations(test_df, test_df.columns, 'denied_flag')

Unnamed: 0,feature,cramers_v
27,denied_flag,0.999938
3,combined_loan_to_value_ratio,0.686217
1,income,0.310726
6,property_value,0.262163
0,loan_amount,0.258803
...,...,...
85,co_applicant_sex_observed_collected,0.005750
127,negative_amortization_True,0.005054
130,occupancy_type_secondary,0.003089
136,other_non_amortizing_features_True,0.001150
