In [50]:
# Imports and configuration
import pandas as pd
from src.utils.file_utils import load_config, get_path
from src.utils.schema_utils import get_columns_by_attribute
from src.helpers.clean_helpers import quick_null_like_check

# Load the two configuration objects used by the cells below:
#   cfg_clean  - read later via cfg_clean["clean"]["null_like"] for the null-like values
#   cfg_schema - passed to get_columns_by_attribute to select columns by schema attribute
# NOTE(review): load_config is a project helper; presumably the argument names a config file/section - confirm.
cfg_clean = load_config("clean")
cfg_schema = load_config("schema")

In [53]:
# Read in raw data file
# get_path("hmda_raw") supplies the location of the parquet file
# (presumably resolved from project configuration - confirm in file_utils).
input_file_path = get_path("hmda_raw")
raw_hmda_df = pd.read_parquet(input_file_path)
# Print (rows, columns) as a quick sanity check on what was loaded.
print(raw_hmda_df.shape)

(12236879, 85)


In [54]:
# Drop columns that are not known prior to application (the target is kept).
# The list comes from the schema config: columns returned for role == "drop"
# (presumably every column whose "role" attribute is "drop" - confirm in schema_utils).
features_to_drop = get_columns_by_attribute(cfg_schema, "role", "drop")
# Reassign the result instead of using inplace=True, so the frame produced by the
# load cell is not mutated behind the back of anything else that references it.
# A column missing from the frame still raises KeyError on purpose: a typo in the
# schema must not silently leave a post-application feature in the data.
raw_hmda_df = raw_hmda_df.drop(columns=features_to_drop)
print(raw_hmda_df.shape)

(12236879, 67)


In [55]:
# Strip leading and trailing whitespace
# All columns are currently string values, so we will perform this on all columns regardless of eventual datatype
# Cells that are genuinely missing are left untouched: a bare astype(str) would turn them
# into the literal strings "None" / "nan" / "<NA>", which later steps would only treat as
# missing if the configured null-like list happens to contain those exact spellings.
raw_hmda_df = raw_hmda_df.apply(
    # where() keeps the original value wherever the condition is True (the cell is missing)
    # and takes the stripped string everywhere else.
    lambda s: s.where(s.isna(), s.astype(str).str.strip())
)

In [61]:
# Check for null-like values.
# NULL_LIKE is the collection of null-like values read from the clean config (clean -> null_like).
# NOTE(review): the original note says a 1% sample is used to speed up the check; no sample
# fraction is passed in this call, so any sampling must happen inside quick_null_like_check - confirm.
NULL_LIKE = cfg_clean["clean"]["null_like"]
null_candidates = quick_null_like_check(raw_hmda_df, NULL_LIKE)
# Displayed as the cell output. Judging by the saved output this is a per-column share of
# null-like values, sorted in descending order - TODO confirm against the helper.
null_candidates

applicant_ethnicity_5           1.000000
co_applicant_ethnicity_5        1.000000
co_applicant_ethnicity_4        0.999984
applicant_ethnicity_4           0.999967
co_applicant_race_5             0.999926
co_applicant_race_4             0.999796
applicant_race_5                0.999788
co_applicant_ethnicity_3        0.999616
applicant_race_4                0.999469
applicant_ethnicity_3           0.998856
co_applicant_race_3             0.998529
applicant_race_3                0.996184
co_applicant_race_2             0.980763
co_applicant_ethnicity_2        0.976489
multifamily_affordable_units    0.974528
applicant_race_2                0.949129
prepayment_penalty_term         0.945550
applicant_ethnicity_2           0.936602
intro_rate_period               0.791295
co_applicant_age_above_62       0.639476
debt_to_income_ratio            0.331587
combined_loan_to_value_ratio    0.329062
property_value                  0.199307
income                          0.146900
applicant_age_ab

In [36]:
# Standardise null-like values to pd.NA in the columns flagged by the null-like check.
# Each flagged column is stripped and lower-cased, then any value found in NULL_LIKE is
# replaced with pd.NA. Note that the lower-cased text is what gets written back, so every
# value in these columns ends up lower-case, not only the null-like ones.
# NOTE(review): confirm that lower-casing the remaining values is intended.
flagged_cols = null_candidates.keys()
null_like_values = list(NULL_LIKE)


def _standardise_nulls(col):
    """Return `col` stripped and lower-cased, with null-like values replaced by pd.NA."""
    normalised = col.str.strip().str.lower()
    return normalised.replace(null_like_values, pd.NA)


raw_hmda_df[flagged_cols] = raw_hmda_df[flagged_cols].apply(_standardise_nulls)

print("Completed replacement of null-like values")

Completed replacement of null-like values


In [62]:
# Find the columns in which every value is missing.
# NOTE(review): this only reflects the null-like replacement if that cell has been run on
# the frame currently in memory. The saved output is an empty list even though the saved
# null-like scan lists columns that are 100% null-like, and the replacement cell carries an
# older execution count than the data-load cell - the saved outputs look like an
# out-of-order run. Restart the kernel and run all cells before relying on this result.
is_all_null = raw_hmda_df.isna().all()
fully_null_cols = raw_hmda_df.columns[is_all_null]
print("Columns with only null values: ", fully_null_cols.tolist())

Columns with only null values:  []
