In [2]:
import pandas as pd
import numpy as np
import s3fs
import boto3
import io

In [5]:
def read_parquet_to_pandas(bucket, prefix, profile=None, max_files=5):
    """Read parquet objects under ``bucket/prefix`` on S3 into one DataFrame.

    The objects to read are discovered by listing ``bucket/prefix`` rather
    than taken from a fixed list, so the function honours its arguments.

    Parameters
    ----------
    bucket : str
        S3 bucket name, without the ``s3://`` scheme.
    prefix : str
        Key prefix ("folder") inside the bucket that holds the parquet parts.
    profile : str, optional
        AWS profile name handed to ``s3fs.S3FileSystem``; ``None`` uses the
        default credential chain.
    max_files : int or None, optional
        Upper bound on how many parquet objects are read, taken in sorted key
        order. Defaults to 5; pass ``None`` to read every parquet object
        found under the prefix.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the selected files with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no ``.parquet`` object is found under ``bucket/prefix``.
    """
    fs = s3fs.S3FileSystem(profile=profile)
    location = f"{bucket}/{prefix}"

    # Keep only parquet objects (listings can also contain markers such as
    # _SUCCESS or sub-prefixes) and sort so the selection is deterministic.
    parquet_files = sorted(
        path for path in fs.ls(location) if path.endswith(".parquet")
    )
    if max_files is not None:
        parquet_files = parquet_files[:max_files]

    if not parquet_files:
        raise ValueError(f"No parquet files found under s3://{location}")

    all_chunks = []
    for pq_file in parquet_files:
        # Listed paths are expected as "bucket/key" without a scheme, so
        # adding it here yields a single, well-formed s3:// URL in the log.
        print(f"🔍 Processing: s3://{pq_file}")

        with fs.open(pq_file, 'rb') as f:
            all_chunks.append(pd.read_parquet(f, engine='pyarrow'))

    return pd.concat(all_chunks, ignore_index=True)

In [6]:
# Load the HMDA parquet parts into a single DataFrame using the named AWS profile.
# NOTE(review): the prefix says 2022 but the file names and the activity_year
# column in the preview below say 2023 — confirm which year is intended.
df = read_parquet_to_pandas(
    bucket='mortgage-data-clean',
    prefix='hmda/2022',
    profile='AdministratorAccess-769392325318',
)

🔍 Processing: s3://s3://mortgage-data-clean/hmda/2022/public_lar2023_public_lar_csv_part0_part0.parquet
🔍 Processing: s3://s3://mortgage-data-clean/hmda/2022/public_lar2023_public_lar_csv_part1_part0.parquet
🔍 Processing: s3://s3://mortgage-data-clean/hmda/2022/public_lar2023_public_lar_csv_part10_part0.parquet
🔍 Processing: s3://s3://mortgage-data-clean/hmda/2022/public_lar2023_public_lar_csv_part11_part0.parquet
🔍 Processing: s3://s3://mortgage-data-clean/hmda/2022/public_lar2023_public_lar_csv_part12_part0.parquet


In [8]:
# Preview the first five rows of the concatenated HMDA frame.
df.head(n=5)

Unnamed: 0,activity_year,lei,derived_msa_md,state_code,county_code,census_tract,conforming_loan_limit,derived_loan_product_type,derived_dwelling_category,derived_ethnicity,...,denial_reason_2,denial_reason_3,denial_reason_4,tract_population,tract_minority_population_percent,ffiec_msa_md_median_family_income,tract_to_msa_income_percentage,tract_owner_occupied_units,tract_one_to_four_family_homes,tract_median_age_of_housing_units
0,2023,549300JOT0D4J0SZIK67,29460,FL,12105.0,12105010000.0,C,VA:First Lien,Single Family (1-4 Units):Site-Built,Ethnicity Not Available,...,,,,2570,43.15,74300,111.11,441,716,25
1,2023,549300JOT0D4J0SZIK67,29820,NV,32003.0,32003000000.0,C,FHA:First Lien,Single Family (1-4 Units):Site-Built,Ethnicity Not Available,...,,,,7043,56.92,83900,109.65,1383,2057,15
2,2023,549300JOT0D4J0SZIK67,49420,WA,53077.0,53077000000.0,C,VA:First Lien,Single Family (1-4 Units):Site-Built,Ethnicity Not Available,...,,,,3044,19.02,76600,131.97,1031,1288,35
3,2023,549300JOT0D4J0SZIK67,28140,KS,20091.0,20091050000.0,C,FHA:First Lien,Single Family (1-4 Units):Site-Built,Ethnicity Not Available,...,,,,4079,10.17,104400,166.18,1240,1341,37
4,2023,549300JOT0D4J0SZIK67,28420,WA,53005.0,53005010000.0,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Ethnicity Not Available,...,,,,9675,26.74,101700,128.98,2474,2688,15


In [10]:
# Read the HMDA_2022_CTData.xls workbook from S3 into `target_df`.
fs = s3fs.S3FileSystem(profile='AdministratorAccess-769392325318')  # or None for default profile

with fs.open('mortgage-data-refined/dycu_outputs/HMDA_2022_CTData.xls', 'rb') as f:
    # The handle is only needed while pandas parses the workbook; the frame
    # itself is previewed in the following cell, so no .head() call here
    # (inside a `with` block its result would be discarded anyway).
    target_df = pd.read_excel(f)

In [11]:
# Preview the first five rows of the table loaded from HMDA_2022_CTData.xls.
target_df.head(n=5)

Unnamed: 0,census_tract,TotalOriginations,WhiteNonHispanicOriginations,BlackOriginations,LatinxOriginations,AAPIOriginations,Applications,Denials,Fallout,OriginationRate,DenialRate,FalloutRate
0,55079000101,29,3,10,2,3,52,6,11,0.557692,0.115385,0.211538
1,55079187000,27,16,3,3,1,42,1,7,0.642857,0.02381,0.166667
2,55079018600,15,0,1,11,1,22,1,4,0.681818,0.045455,0.181818
3,55079003500,37,6,21,5,2,61,7,8,0.606557,0.114754,0.131148
4,55079017900,30,23,0,2,1,49,1,11,0.612245,0.020408,0.22449
