In [11]:
# Imports and configuration
%load_ext autoreload
%autoreload 2

import pandas as pd
import src.utils.file_utils as fu
import src.utils.schema_utils as su
import src.helpers.clean_helpers as chelp

# Load the "clean" and "schema" configurations (in that order) via fu.load_config
cfg_clean, cfg_schema = (fu.load_config(name) for name in ("clean", "schema"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Read in raw data file
# The path is looked up under the key "hmda_raw" via fu.get_path; the parquet
# file is read with dtype_backend="pyarrow" so columns use pyarrow-backed dtypes.
input_file_path = fu.get_path("hmda_raw")
raw_hmda_df = pd.read_parquet(input_file_path, dtype_backend="pyarrow")
# (rows, columns) of the raw frame, as a baseline for the column drops that follow
print(raw_hmda_df.shape)

(12236879, 85)


In [13]:
# Remove every column whose schema "role" is "drop" — per the original note these
# are fields not known prior to application (the target column is kept).
# The drop mutates raw_hmda_df in place, so this cell is not safe to run twice
# on the same frame.
features_to_drop = su.get_columns_by_attribute(cfg_schema, "role", "drop")
raw_hmda_df.drop(labels=features_to_drop, axis="columns", inplace=True)
print(raw_hmda_df.shape)

(12236879, 67)


In [14]:
# Strip leading and trailing whitespace
# All columns are currently string values, so we will perform this on all columns regardless of eventual datatype
# NOTE(review): the helper's return value is not used here, so it presumably
# mutates raw_hmda_df in place (as its name suggests) — confirm in clean_helpers.
chelp.strip_string_columns_inplace(raw_hmda_df)

In [15]:
# Check for null-like values, using the list configured under clean -> null_like
# in the "clean" config.
# NOTE(review): the original note says a 1% sample is used to speed up the check.
# No sample fraction is passed in this call, so the sampling presumably happens
# inside quick_null_like_check — confirm in clean_helpers.
NULL_LIKE = cfg_clean["clean"]["null_like"]
null_candidates = chelp.quick_null_like_check(raw_hmda_df, NULL_LIKE)
# Bare last expression so the notebook displays the result
null_candidates

Series([], Name: null_like_fraction, dtype: object)

In [16]:
# Find columns in which every single value is null: isna() flags each cell,
# all(axis=0) reduces down the rows to one boolean per column.
fully_null_cols = raw_hmda_df.columns[raw_hmda_df.isna().all(axis=0)]
print("Columns with only null values: ", list(fully_null_cols))

Columns with only null values:  []


In [19]:
# Some columns conflate an "Exempt" flag with the actual feature value. Identify
# those columns and split the exempt flag out into its own column.

# Schema lookups: columns with role "feature", and columns marked exempt=True
features = set(su.get_columns_by_attribute(cfg_schema, "role", "feature"))
exempt = set(su.get_columns_by_attribute(cfg_schema, "exempt", True))
# Keep only the overlap so that dropped columns are excluded; sorted for a stable order
exempt_feature_cols = sorted(features.intersection(exempt))
print("Features using exempt: ", exempt_feature_cols)

# Apply the split; the helper returns the names of the flag columns it created
created_flags = chelp.apply_exempt_split(raw_hmda_df, exempt_feature_cols)
print("Created: ", created_flags)

Features using exempt:  ['combined_loan_to_value_ratio', 'county_code', 'debt_to_income_ratio', 'intro_rate_period', 'loan_term', 'multifamily_affordable_units', 'prepayment_penalty_term', 'property_value']
Created:  ['combined_loan_to_value_ratio_exempt', 'county_code_exempt', 'debt_to_income_ratio_exempt', 'intro_rate_period_exempt', 'loan_term_exempt', 'multifamily_affordable_units_exempt', 'prepayment_penalty_term_exempt', 'property_value_exempt']


In [21]:
# We have one exception case with income to clean up before conversion:
# the literal string "999999999" is replaced with pd.NA.
# NOTE(review): presumably a placeholder for a missing income value — confirm
# against the data dictionary.
raw_hmda_df["income"] = raw_hmda_df["income"].replace("999999999", pd.NA)

# Convert columns to correct data types, driven by the schema config.
# The frame is rebound to the helper's return value.
raw_hmda_df = chelp.convert_by_schema(raw_hmda_df, cfg_schema)
# Printing the dtypes Series abbreviates long listings, so only the first and
# last few columns are visible in the output.
print(raw_hmda_df.dtypes)

activity_year                                    Int16
lei                                    string[pyarrow]
loan_type                                        Int16
loan_purpose                                     Int16
preapproval                                      Int16
                                            ...       
intro_rate_period_exempt                 bool[pyarrow]
loan_term_exempt                         bool[pyarrow]
multifamily_affordable_units_exempt      bool[pyarrow]
prepayment_penalty_term_exempt           bool[pyarrow]
property_value_exempt                    bool[pyarrow]
Length: 75, dtype: object


In [24]:
# Work out which feature columns the schema types as "categorical".
# `features` is the role == "feature" set built in the exempt-split cell, so that
# cell must have been run first.
categorical = set(su.get_columns_by_attribute(cfg_schema, "type", "categorical"))
# Intersect with the feature set so that dropped columns are excluded
categorical_feature_cols = sorted(categorical.intersection(features))
print("Categorical features: ", categorical_feature_cols)

Categorical features:  ['action_taken', 'applicant_age', 'applicant_age_above_62', 'applicant_credit_scoring_model', 'applicant_ethnicity_1', 'applicant_ethnicity_2', 'applicant_ethnicity_3', 'applicant_ethnicity_4', 'applicant_ethnicity_5', 'applicant_ethnicity_observed', 'applicant_race_1', 'applicant_race_2', 'applicant_race_3', 'applicant_race_4', 'applicant_race_5', 'applicant_race_observed', 'applicant_sex', 'applicant_sex_observed', 'balloon_payment', 'business_or_commercial_purpose', 'co_applicant_age', 'co_applicant_age_above_62', 'co_applicant_credit_scoring_model', 'co_applicant_ethnicity_1', 'co_applicant_ethnicity_2', 'co_applicant_ethnicity_3', 'co_applicant_ethnicity_4', 'co_applicant_ethnicity_5', 'co_applicant_ethnicity_observed', 'co_applicant_race_1', 'co_applicant_race_2', 'co_applicant_race_3', 'co_applicant_race_4', 'co_applicant_race_5', 'co_applicant_race_observed', 'co_applicant_sex', 'co_applicant_sex_observed', 'combined_loan_to_value_ratio', 'construction_me

In [25]:
# Convert columns to categorical
# Applies to the columns listed in categorical_feature_cols.
# NOTE(review): the return value is not used, so the helper presumably converts
# the columns of raw_hmda_df in place — confirm in clean_helpers.
chelp.to_pandas_categoricals(raw_hmda_df, categorical_feature_cols)

In [43]:
# Sanity-check numeric columns for illogical or extreme values.
# Each count is the number of True entries in a boolean mask; missing values do
# not satisfy the comparison and are therefore not counted.
print("Negative/zero loan amounts: ", int(raw_hmda_df["loan_amount"].le(0).sum()))
print("Negative/zero property values: ", int(raw_hmda_df["property_value"].le(0).sum()))
print("Negative incomes: ", int(raw_hmda_df["income"].lt(0).sum()))

Negative/zero loan amounts:  0
Negative/zero property values:  0
Negative incomes:  7757


In [47]:
# Negative incomes are treated as sign typos and flipped to their absolute value.
# NOTE(review): the "typo" reading is an assumption — a negative income may be a
# legitimately reported value in this dataset; confirm before relying on it.
mask = raw_hmda_df["income"].lt(0)
raw_hmda_df.loc[mask, "income"] = raw_hmda_df.loc[mask, "income"].abs()
# Re-count to verify that no negative incomes remain
print("Negative incomes: ", int(raw_hmda_df["income"].lt(0).sum()))

Negative incomes:  0
