In [13]:
import pandas as pd
import numpy as np
import s3fs
import boto3
import io
from census import Census
from us import states

In [19]:
fs = s3fs.S3FileSystem(profile='AdministratorAccess-769392325318')  # or None for default profile

parquet_files = fs.glob("mortgage-data-refined/mke_data/*.parquet")

dfs = [pd.read_parquet(f"s3://{file}", engine='pyarrow', filesystem=fs) for file in parquet_files]

df = pd.concat(dfs, ignore_index=False)
    

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 139293 entries, 0 to 15958
Data columns (total 100 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   activity_year                             139293 non-null  int64  
 1   lei                                       139293 non-null  object 
 2   derived_msa_md                            139293 non-null  int64  
 3   state_code                                139293 non-null  object 
 4   county_code                               139173 non-null  float64
 5   census_tract                              139293 non-null  int64  
 6   conforming_loan_limit                     139293 non-null  object 
 7   derived_loan_product_type                 139293 non-null  object 
 8   derived_dwelling_category                 139293 non-null  object 
 9   derived_ethnicity                         139293 non-null  object 
 10  derived_race             

In [67]:
df.tract_to_msa_income_percentage.unique().tolist()

[119.0,
 98.0,
 63.0,
 75.0,
 72.0,
 107.0,
 77.0,
 99.0,
 96.0,
 54.0,
 93.0,
 65.0,
 44.0,
 86.0,
 94.0,
 85.0,
 41.0,
 80.0,
 59.0,
 92.0,
 90.0,
 101.0,
 56.0,
 109.0,
 52.0,
 110.0,
 106.0,
 114.0,
 186.0,
 97.0,
 67.0,
 53.0,
 46.0,
 58.0,
 47.0,
 73.0,
 39.0,
 50.0,
 37.0,
 140.0,
 66.0,
 43.0,
 30.0,
 48.0,
 34.0,
 84.0,
 88.0,
 69.0,
 105.0,
 129.0,
 45.0,
 33.0,
 35.0,
 91.0,
 81.0,
 64.0,
 51.0,
 175.0,
 115.0,
 61.0,
 128.0,
 36.0,
 32.0,
 111.0,
 24.0,
 108.0,
 71.0,
 196.0,
 70.0,
 79.0,
 29.0,
 28.0,
 165.0,
 204.0,
 42.0,
 38.0,
 49.0,
 104.0,
 74.0,
 31.0,
 82.0,
 40.0,
 78.0,
 27.0,
 143.0,
 68.0,
 22.0,
 23.0,
 12.0,
 100.0,
 124.0,
 57.0,
 169.0,
 103.0,
 112.0,
 102.0,
 83.0,
 26.0,
 89.0,
 150.0,
 139.0,
 121.0,
 156.0,
 240.0,
 95.0,
 0.0,
 87.0,
 62.0,
 177.0,
 133.0,
 161.0,
 122.0,
 125.0,
 18.0,
 49.71,
 44.38,
 48.22,
 87.73,
 74.83,
 94.72,
 51.57,
 31.5,
 71.46,
 44.04,
 31.41,
 48.27,
 37.69,
 44.43,
 35.62,
 40.4,
 31.79,
 49.6,
 22.07,
 29.79,
 37.56,
 

In [69]:
float_to_int_cols = ['county_code', 'loan_amount', 'income', 'applicant_ethnicity_1', 'applicant_ethnicity_2', 
                    'applicant_ethnicity_3', 'applicant_ethnicity_4', 'applicant_ethnicity_5']
str_to_float_cols = ['combined_loan_to_value_ratio', 'interest_rate', 'rate_spread', 'total_loan_costs', 
                    'total_points_and_fees', 'origination_charges', 'discount_points', 
                    'lender_credits']
str_to_int_cols = ['loan_term', 'property_value']
str_to_int_exempt = ['intro_rate_period', 'prepayment_penalty_term', 'multifamily_affordable_units']
categorical_cols = ['conforming_loan_limit', 'applicant_age', 'co_applicant_age', 'debt_to_income_ratio']
convert_na_to_nan = ['purchaser_type', 'loan_purpose']
yes_no_bool_conversion = ['applicant_age_above_62', 'co_applicant_age_above_62']
convert_exempt_to_nan = ['reverse_mortgage', 'open_end_line_of_credit', 'business_or_commercial_purpose', 
                        'negative_amortization', 'interest_only_payment', 'balloon_payment', 
                        'other_nonamortizing_features', 'manufactured_home_secured_property_type', 
                        'manufactured_home_land_property_interest']
other = ['derived_dwelling_category', 'action_taken', 'hoepa_status', 'total_units', 
        'co_applicant_credit_score_type', 'co_applicant_ethnicity_1', 'co_applicant_ethnicity_2', 
        'co_applicant_ethnicity_3', 'co_applicant_ethnicity_4', 'co_applicant_ethnicity_5', 
        'applicant_ethnicity_observed', 'co_applicant_ethnicity_observed', 'applicant_race_1', 
        'applicant_race_2', 'applicant_race_3', 'applicant_race_4', 'applicant_race_5', 
        'co_applicant_race_1', 'co_applicant_race_2', 'co_applicant_race_3', 'co_applicant_race_4', 
        'co_applicant_race_5', 'applicant_race_observed', 'co_applicant_race_observed', 
        'applicant_sex', 'co_applicant_sex', 'applicant_sex_observed', 'co_applicant_sex_observed', 
        'submission_of_application', 'initially_payable_to_institution', 'aus_1', 'aus_2', 
        'aus_3', 'aus_4', 'aus_5', 'denial_reason_1', 'denial_reason_2', 'denial_reason_3', 
        'denial_reason_4', 'applicant_credit_score_type']

In [70]:
def send_dataframe_to_s3_csv(df, bucket, key, profile=None):
    # Set up AWS session
    session = boto3.Session(profile_name=profile) if profile else boto3.Session()
    s3 = session.client('s3')

    # Create in-memory CSV buffer
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)

    # Upload to S3
    s3.put_object(Bucket=bucket, Key=key, Body=csv_buffer.getvalue())
    print(f"✅ Uploaded to s3://{bucket}/{key}")


In [71]:
# Start by uploading this dataframe to S3
file_key = "mke_data/combined_hmda_2018_to_2023.csv"
bucket = 'mortgage-data-refined'
send_dataframe_to_s3_csv(df, bucket, file_key, profile='AdministratorAccess-769392325318')

✅ Uploaded to s3://mortgage-data-refined/mke_data/combined_hmda_2018_to_2023.csv


In [73]:
# Start cleaning the files - float to int first
for col in float_to_int_cols: 
    df[col] = df[col].astype('Int64')

In [74]:
test = df.copy()
# Now convert the string to floats
for col in str_to_float_cols: 
    test[col] = pd.to_numeric(test[col], errors='coerce')

test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 139293 entries, 0 to 15958
Data columns (total 100 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   activity_year                             139293 non-null  int64  
 1   lei                                       139293 non-null  object 
 2   derived_msa_md                            139293 non-null  int64  
 3   state_code                                139293 non-null  object 
 4   county_code                               139173 non-null  Int64  
 5   census_tract                              139293 non-null  int64  
 6   conforming_loan_limit                     139293 non-null  object 
 7   derived_loan_product_type                 139293 non-null  object 
 8   derived_dwelling_category                 139293 non-null  object 
 9   derived_ethnicity                         139293 non-null  object 
 10  derived_race             