In [1]:
import pandas as pd

df_msa = pd.read_csv("data/msa-by-zip.csv")

df_msa = df_msa[df_msa["ZIP CODE"] >= 601]

df_msa["ZIP CODE"] = df_msa["ZIP CODE"].astype(str).str.zfill(5)

df_msa.columns = [col.lower().replace(" ", "_") for col in df_msa.columns]

df_subset = df_msa[["zip_code", "msa_name"]]

df_msa.head()

Unnamed: 0,zip_code,state,msa_no.,gpci,gpci.1,gpci.2,county_no.,msa_name,unnamed:_8
8,601,PR,99072.0,1.0,0.845,0.249,1.0,PR NONMETROPOLITAN AREA,
9,602,PR,10380.0,1.0,0.845,0.249,3.0,"Aguadilla-Isabela-San Sebastian, PR MSA",
10,603,PR,10380.0,1.0,0.845,0.249,5.0,"Aguadilla-Isabela-San Sebastian, PR MSA",
11,604,PR,10380.0,1.0,0.845,0.249,5.0,"Aguadilla-Isabela-San Sebastian, PR MSA",
12,605,PR,10380.0,1.0,0.845,0.249,5.0,"Aguadilla-Isabela-San Sebastian, PR MSA",


In [2]:
def clean_rent_data(df, year):    
    #Create new column for region name
    df["zip_code"] = df["zip_code"].astype(str).str.zfill(5)
    df["msa_name"] = df["zip_code"].map(df_subset.set_index("zip_code")["msa_name"])

    #Drop NONMETROPOLITAN Rows
    df= df[~df['msa_name'].str.contains('NONMETROPOLITAN', na=False)]
    df = df.dropna(subset=['msa_name'])

    #Standardize Column Names
    df.columns = [col.lower().replace(" ", "_") for col in df.columns]

    #Reset Index of Dataframe
    df = df.reset_index(drop = True)

    #Convert rent col to numeric
    df["median_contract_rent"] = pd.to_numeric(df["median_contract_rent"], errors = "coerce")

    #Create new col for region rent
    df["region_contract_rent_median"] = df.groupby("msa_name")["median_contract_rent"].transform("median")

    #Add year
    df["year"] = year

    return df

In [3]:
import requests
import pandas as pd

API_KEY = "c43997742a8d45e194e874dbffe3766cb7b41101" 

#Base - 2011 Data
url = f"https://api.census.gov/data/2011/acs/acs5?get=NAME,B25056_001E&for=zip%20code%20tabulation%20area:*&key={API_KEY}"
response = requests.get(url)
response.raise_for_status()
data = response.json()
cols = data[0]
rows = data[1:]

df_2011 = pd.DataFrame(rows, columns = cols)
df_2011 = df_2011.rename(columns = {"B25056_001E" : "median_contract_rent", "zip code tabulation area" : "zip_code"})
    
df_2011 = clean_rent_data(df_2011, 2011)

#Rest - 2012-2023

all_year = []

for year in range (2012, 2024):
    print(f"Starting Year: {year}")
    url = f"https://api.census.gov/data/{year}/acs/acs5?get=NAME,B25056_001E&for=zip%20code%20tabulation%20area:*&key={API_KEY}"
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()
    cols = data[0]
    rows = data[1:]

    df = pd.DataFrame(rows, columns = cols)

    df = df.rename(columns = {"B25056_001E" : "median_contract_rent",
                          "zip code tabulation area" : "zip_code"})
    
    df = clean_rent_data(df, year)

    all_year.append(df)

rent_2012_2023 = pd.concat(all_year, ignore_index = True)
rent_2012_2023.head()

Starting Year: 2012
Starting Year: 2013
Starting Year: 2014
Starting Year: 2015
Starting Year: 2016
Starting Year: 2017
Starting Year: 2018
Starting Year: 2019
Starting Year: 2020
Starting Year: 2021
Starting Year: 2022
Starting Year: 2023


Unnamed: 0,name,median_contract_rent,state,zip_code,msa_name,region_contract_rent_median,year
0,ZCTA5 40014,828,21,40014,"Louisville/Jefferson County, KY-IN MSA",270.0,2012
1,ZCTA5 40022,19,21,40022,"Louisville/Jefferson County, KY-IN MSA",270.0,2012
2,ZCTA5 40155,323,21,40155,"Louisville/Jefferson County, KY-IN MSA",270.0,2012
3,ZCTA5 40160,4245,21,40160,"Elizabethtown, KY MSA",194.0,2012
4,ZCTA5 40203,7589,21,40203,"Louisville/Jefferson County, KY-IN MSA",270.0,2012


In [4]:
rent_2012_2023 = rent_2012_2023.sort_values(by = "zip_code").reset_index(drop = True)
rent_2012_2023 = rent_2012_2023.drop(["state", "name"], axis = 1)
rent_2012_2023["median_contract_rent"] = rent_2012_2023["median_contract_rent"].astype(float)

rent_2012_2023.head()

Unnamed: 0,median_contract_rent,zip_code,msa_name,region_contract_rent_median,year
0,3076.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",3190.0,2021
1,3212.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",3337.0,2019
2,2903.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",3031.5,2014
3,3257.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",3589.0,2018
4,3147.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",3258.0,2022


In [5]:
df_2011_renamed = df_2011.rename(columns={
    "median_contract_rent": "zip_rent_2011",
    "region_contract_rent_median": "region_rent_2011"
})

rent_2012_2023 = rent_2012_2023.merge(
    df_2011_renamed[["zip_code", "msa_name", "zip_rent_2011", "region_rent_2011"]],
    on = ["zip_code", "msa_name"],
    how = "left"
)

rent_2012_2023["gentrified"] = ((rent_2012_2023["median_contract_rent"] - rent_2012_2023["zip_rent_2011"]) > (rent_2012_2023["region_contract_rent_median"] - rent_2012_2023["region_rent_2011"]))

rent_2012_2023 = rent_2012_2023.drop(columns=["zip_rent_2011", "region_rent_2011", "region_contract_rent_median"])

In [9]:
rent_2012_2023.head()

Unnamed: 0,median_contract_rent,zip_code,msa_name,year,gentrified
0,3076.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2021,False
1,3212.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2019,False
2,2903.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2014,False
3,3257.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2018,False
4,3147.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2022,False


In [6]:
df_2011.to_csv("rent_2011.csv", index = False)
rent_2012_2023.to_csv("rent_2012_2023.csv", index = False)

In [7]:
#Long to wide format
wide_format = rent_2012_2023.pivot_table(index = "zip_code", columns = "year", values = "median_contract_rent").reset_index()

wide_format.head()

year,zip_code,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,602,2869.0,2880.0,2903.0,3091.0,3208.0,3225.0,3257.0,3212.0,3034.0,3076.0,3147.0,3103.0
1,603,5897.0,6168.0,6636.0,7184.0,7513.0,7767.0,8081.0,8110.0,8333.0,8041.0,8366.0,8389.0
2,610,2055.0,2202.0,2284.0,2225.0,2057.0,2040.0,2064.0,2037.0,2288.0,2482.0,2389.0,2423.0
3,612,6591.0,7021.0,7314.0,7615.0,8034.0,8111.0,7894.0,8485.0,8865.0,8853.0,8781.0,9103.0
4,616,740.0,875.0,978.0,1018.0,905.0,897.0,858.0,831.0,798.0,976.0,1027.0,1047.0


In [8]:

# for yr in range (2012, 2024):
#     df_yr_only = rent_2012_2023[rent_2012_2023["year"].isin(yr)]
    



#     zip_table = pd.pivot_table(df_yr_only, values = "median_contract_rent", index = "zip_code", columns = "year")
#     zip_table[f"rent_change_2011-{yr}_zip"] = zip_table[yr] - zip_table[2011]

#     region_table = pd.pivot_table(df_yr_only, values = "region_contract_rent_median", index = "msa_name", columns = "year")
#     region_table[f"rent_change_2011-{yr}_region"] = region_table[yr] - region_table[2011]

#     rent_2011_2023 = rent_2011_2023.merge(zip_table[f"rent_change_2011-{yr}_zip"], on = "zip_code", how = "left")
#     rent_2011_2023 = rent_2011_2023.merge(region_table[f"rent_change_2011-{yr}_region"], on = "msa_name", how = "left")

#     rent_2011_2023[f"hot_2011_{yr}"] = rent_2011_2023[f"rent_change_2011-{yr}_zip"] > rent_2011_2023[f"rent_change_2011-{yr}_region"] 
