In [None]:
import pandas as pd

# Load the MSA (Metropolitan Statistical Area) data by ZIP code from a CSV file
df_msa = pd.read_csv("../data/msa-by-zip.csv")

# Keep only rows whose numeric ZIP code is >= 601 (i.e. "00601" once padded).
# NOTE(review): the first retained rows are Puerto Rico ZIPs, so this filter does
# not remove non-continental areas; presumably it drops placeholder/invalid rows
# at the top of the file - confirm against the raw CSV.
# .copy() makes the filtered frame independent of the one read from disk, so the
# column assignment below modifies a real frame rather than a slice of another
# one (avoids pandas' SettingWithCopyWarning and its ambiguous semantics).
df_msa = df_msa[df_msa["ZIP CODE"] >= 601].copy()

# Convert ZIP codes to string type and pad with leading zeros to ensure 5-digit format
df_msa["ZIP CODE"] = df_msa["ZIP CODE"].astype(str).str.zfill(5)

# Standardize column names: lowercase and replace spaces with underscores
df_msa.columns = [col.lower().replace(" ", "_") for col in df_msa.columns]

# ZIP -> MSA lookup used later to label rent rows. Copied so that later changes
# to either frame cannot leak into the other.
df_subset = df_msa[["zip_code", "msa_name"]].copy()

# Display the first few rows of the processed dataframe
df_msa.head()


Unnamed: 0,zip_code,state,msa_no.,gpci,gpci.1,gpci.2,county_no.,msa_name,unnamed:_8
8,601,PR,99072.0,1.0,0.845,0.249,1.0,PR NONMETROPOLITAN AREA,
9,602,PR,10380.0,1.0,0.845,0.249,3.0,"Aguadilla-Isabela-San Sebastian, PR MSA",
10,603,PR,10380.0,1.0,0.845,0.249,5.0,"Aguadilla-Isabela-San Sebastian, PR MSA",
11,604,PR,10380.0,1.0,0.845,0.249,5.0,"Aguadilla-Isabela-San Sebastian, PR MSA",
12,605,PR,10380.0,1.0,0.845,0.249,5.0,"Aguadilla-Isabela-San Sebastian, PR MSA",


In [2]:
def clean_rent_data(df, year, msa_lookup=None):
    """Label one year of ZIP-level rent data with its MSA and regional median.

    The input frame is not modified; a cleaned copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``zip_code`` and ``median_contract_rent``.
    year : int
        Value written to the ``year`` column of every returned row.
    msa_lookup : pandas.DataFrame, optional
        Frame with ``zip_code`` (5-character strings) and ``msa_name`` columns.
        Defaults to the notebook-level ``df_subset``.

    Returns
    -------
    pandas.DataFrame
        Rows whose ZIP maps to an MSA whose name does not contain
        "NONMETROPOLITAN", with standardized column names, a fresh 0..n-1
        index, numeric ``median_contract_rent`` (unparseable or negative
        values become NaN) and the added columns ``msa_name``,
        ``region_contract_rent_median`` and ``year``.
    """
    if msa_lookup is None:
        msa_lookup = df_subset

    # Series.map raises when the mapping Series has a non-unique index, so keep
    # the first MSA listed for any ZIP that appears more than once.
    zip_to_msa = (
        msa_lookup.drop_duplicates(subset="zip_code")
        .set_index("zip_code")["msa_name"]
    )

    # Work on a copy so the caller's frame keeps its original columns/values.
    cleaned = df.copy()

    # Create new column for region name
    cleaned["zip_code"] = cleaned["zip_code"].astype(str).str.zfill(5)
    cleaned["msa_name"] = cleaned["zip_code"].map(zip_to_msa)

    # Drop ZIPs with no MSA match, then drop NONMETROPOLITAN rows
    cleaned = cleaned.dropna(subset=["msa_name"])
    is_nonmetro = cleaned["msa_name"].str.contains("NONMETROPOLITAN", na=False)
    cleaned = cleaned[~is_nonmetro]

    # Standardize column names
    cleaned.columns = [col.lower().replace(" ", "_") for col in cleaned.columns]

    # Reset index of dataframe (also yields a fresh, independent frame)
    cleaned = cleaned.reset_index(drop=True)

    # Convert rent col to numeric. A rent cannot be negative; the Census API uses
    # large negative numbers (e.g. -666666666) as "no estimate" markers, so treat
    # any negative value as missing instead of letting it distort the medians.
    rent = pd.to_numeric(cleaned["median_contract_rent"], errors="coerce")
    cleaned["median_contract_rent"] = rent.where(rent >= 0)

    # Create new col for region rent (median of the ZIP-level values per MSA)
    cleaned["region_contract_rent_median"] = (
        cleaned.groupby("msa_name")["median_contract_rent"].transform("median")
    )

    # Add year
    cleaned["year"] = year

    return cleaned

In [None]:
import os

import pandas as pd
import requests

# The Census API key is read from the environment rather than being stored in
# the notebook. If CENSUS_API_KEY is not set, requests are sent without a key.
# NOTE(review): the key that was previously committed here should be revoked.
API_KEY = os.environ.get("CENSUS_API_KEY")

# B25058_001E is the ACS "Median contract rent (dollars)" estimate.
# (B25056_001E, used previously, is the total of the Contract Rent table - a
# count of housing units, not a rent in dollars.)
RENT_VARIABLE = "B25058_001E"

all_year = []

for year in range(2011, 2024):
    print(f"Starting Year: {year}")
    url = (
        f"https://api.census.gov/data/{year}/acs/acs5"
        f"?get=NAME,{RENT_VARIABLE}&for=zip%20code%20tabulation%20area:*"
    )
    if API_KEY:
        url += f"&key={API_KEY}"

    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(url, timeout=120)
    response.raise_for_status()

    # The response is a JSON array of rows; the first row holds the column names.
    data = response.json()
    cols = data[0]
    rows = data[1:]

    df = pd.DataFrame(rows, columns=cols)

    df = df.rename(columns={RENT_VARIABLE: "median_contract_rent",
                            "zip code tabulation area": "zip_code"})

    # The API reports "no estimate" as large negative numbers (e.g. -666666666).
    # Null them before cleaning so they cannot skew the regional medians that
    # clean_rent_data computes.
    rent = pd.to_numeric(df["median_contract_rent"], errors="coerce")
    df["median_contract_rent"] = rent.where(rent >= 0)

    df = clean_rent_data(df, year)

    all_year.append(df)

# Stack all years into one long frame and persist the raw result.
rent_2011_2023 = pd.concat(all_year, ignore_index=True)
rent_2011_2023.to_csv('../data/rent_data_raw.csv', index=False)

# Last expression of the cell, so the preview is actually rendered.
rent_2011_2023.head()

Starting Year: 2011
Starting Year: 2012
Starting Year: 2013
Starting Year: 2014
Starting Year: 2015
Starting Year: 2016
Starting Year: 2017
Starting Year: 2018
Starting Year: 2019
Starting Year: 2020
Starting Year: 2021
Starting Year: 2022
Starting Year: 2023


In [None]:
# For each year from 2012 to 2023 (inclusive), add three columns to rent_2011_2023:
#   rent_change_2011-<yr>_zip     ZIP-level rent in <yr> minus ZIP-level rent in 2011
#   rent_change_2011-<yr>_region  regional median in <yr> minus regional median in 2011
#   hot_2011_<yr>                 True where the ZIP change exceeds the regional change
# Columns are assigned by key lookup (map) rather than merged in, so re-running
# this cell overwrites them instead of creating duplicate _x/_y columns.
for yr in range(2012, 2024):
    zip_change_col = f"rent_change_2011-{yr}_zip"
    region_change_col = f"rent_change_2011-{yr}_region"

    # Filter rent data to keep only rows from 2011 and the current year 'yr'
    df_yr_only = rent_2011_2023[rent_2011_2023["year"].isin([2011, yr])]

    # Pivot table for ZIP codes:
    # Rows = zip_code, Columns = year, Values = median_contract_rent
    zip_table = pd.pivot_table(
        df_yr_only,
        values="median_contract_rent",
        index="zip_code",
        columns="year"
    )

    # Rent change for each ZIP code between 2011 and the current year 'yr'
    zip_table[zip_change_col] = zip_table[yr] - zip_table[2011]

    # Pivot table for MSA regions:
    # Rows = msa_name, Columns = year, Values = regional median contract rent
    region_table = pd.pivot_table(
        df_yr_only,
        values="region_contract_rent_median",
        index="msa_name",
        columns="year"
    )

    # Rent change for each MSA region between 2011 and the current year 'yr'
    region_table[region_change_col] = region_table[yr] - region_table[2011]

    # Attach the ZIP-level change to every row of that ZIP code. Keys missing
    # from the pivot table get NaN, exactly as a left merge would produce.
    rent_2011_2023[zip_change_col] = rent_2011_2023["zip_code"].map(
        zip_table[zip_change_col]
    )

    # Attach the region-level change to every row of that MSA.
    rent_2011_2023[region_change_col] = rent_2011_2023["msa_name"].map(
        region_table[region_change_col]
    )

    # Boolean flag marking ZIP codes whose rent change exceeds their region's
    # ("hot" areas). A comparison involving NaN evaluates to False, so ZIPs with
    # a missing change on either side are flagged False.
    rent_2011_2023[f"hot_2011_{yr}"] = (
        rent_2011_2023[zip_change_col] > rent_2011_2023[region_change_col]
    )


In [5]:
# Remove the descriptive columns that are not needed downstream.
# errors="ignore" lets the cell be re-run (or run on data lacking one of the
# columns) without raising KeyError.
rent_2011_2023 = rent_2011_2023.drop(columns=["state", "name"], errors="ignore")

# Make sure the rent column is stored as float.
rent_2011_2023["median_contract_rent"] = rent_2011_2023["median_contract_rent"].astype(float)

In [6]:
# Write the final table to the current working directory. The DataFrame index is
# written too (to_csv default), so the file starts with an unnamed index column.
output_csv = "contract_rent_2011-2023.csv"
rent_2011_2023.to_csv(output_csv)