# Classification with Forecasted 2035 Data

This notebook uses forecasted 2035 data to classify whether each ZIP code is gentrified.

In [None]:
import pandas as pd

# Build one table holding each ZIP code's 2012 baseline metrics next to its
# forecasted 2035 metrics, keyed by ZIP code and MSA name.

# Load the base dataset containing ZIP code-level demographic data from 2012
base_data = pd.read_csv('../data/all_gentrification_prediction.csv')

# Load the forecasted data (restricted to year 2035 further down)
forecasted_data = pd.read_csv('../data/forecasted_data_2035.csv')

# Keep only the baseline columns we need and shorten the long names.
# rename() maps by name, so the result does not depend on column position.
df_base = base_data[
    ['zip_code', 'msa_name', 'income_2012', 'per_college_educated_2012', 'median_contract_rent_2012']
].rename(columns={
    'per_college_educated_2012': 'college_2012',
    'median_contract_rent_2012': 'rent_2012',
})

# Load a mapping of ZIP codes to MSA (Metropolitan Statistical Area)
df_msa = pd.read_csv('../data/msa-by-zip.csv')

# Filter out ZIP codes below 601 (treated here as invalid / non-residential)
df_msa = df_msa[df_msa['ZIP CODE'] >= 601]

# Standardize column names: lowercase and replace spaces with underscores
df_msa = df_msa.rename(columns=lambda col: col.lower().replace(" ", "_"))

# Keep only the ZIP code and MSA name columns for merging
df_subset = df_msa[['zip_code', 'msa_name']]

# Filter forecasted data to only include entries for the year 2035
df_forecast = forecasted_data[forecasted_data['year'] == 2035]

# Attach MSA names to the forecast rows; ZIP codes missing from the mapping
# keep a NaN msa_name because this is a left join
df_forecast = pd.merge(left=df_forecast, right=df_subset, on='zip_code', how='left')

# Drop the 'year' column since it's now redundant (all entries are for 2035)
df_forecast = df_forecast.drop(columns='year')

# Rename columns in the forecasted dataset for consistency with base dataset.
# NOTE(review): this is a positional rename - it assumes the forecast CSV holds
# exactly zip_code, income, college and rent (in that order) besides 'year',
# with msa_name appended last by the merge. Confirm against the CSV header.
df_forecast.columns = ['zip_code', 'income_2035', 'college_2035', 'rent_2035', 'msa_name']

# Remove duplicate forecast rows per (ZIP code, MSA) pair.
# De-duplicating on zip_code alone would keep only the first MSA of a ZIP code
# that the mapping lists under several MSAs; if that first MSA differed from
# the one in the base dataset, the inner join below would silently drop the
# ZIP code. Using the same key as the join avoids that loss.
df_forecast = df_forecast.drop_duplicates(subset=['zip_code', 'msa_name'])

# Merge the 2012 base data with the 2035 forecast data using ZIP code and MSA name
df_merged = pd.merge(df_base, df_forecast, on=['zip_code', 'msa_name'], how='inner')

# Display the final merged DataFrame
df_merged


Unnamed: 0,zip_code,msa_name,income_2012,college_2012,rent_2012,income_2035,college_2035,rent_2035
0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",15106.0,20.1,2869.0,21878,24.0,4602
1,603,"Aguadilla-Isabela-San Sebastian, PR MSA",15079.0,21.0,5897.0,21839,25.1,9459
2,610,"Aguadilla-Isabela-San Sebastian, PR MSA",16923.0,14.7,2055.0,24510,17.6,3296
3,631,"Aguadilla-Isabela-San Sebastian, PR MSA",11651.0,14.7,272.0,16874,17.6,436
4,662,"Aguadilla-Isabela-San Sebastian, PR MSA",14735.0,17.2,5257.0,21341,20.6,8432
...,...,...,...,...,...,...,...,...
15252,99674,"Anchorage, AK MSA",50962.0,20.9,132.0,73808,25.0,212
15253,99676,"Anchorage, AK MSA",34576.0,22.3,88.0,50076,26.7,141
15254,99683,"Anchorage, AK MSA",50441.0,10.4,43.0,73054,12.4,69
15255,99688,"Anchorage, AK MSA",60030.0,18.1,58.0,86941,21.6,93


This cell computes the change in each metric from 2012 to 2035 and checks whether each ZIP code's change exceeds the median change across ZIP codes in its MSA.

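If the change exceeds the MSA median for all three metrics (income, college education rate, and rent), the ZIP code is classified as gentrified in 2035.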

In [None]:
# Per-ZIP absolute change (2035 value minus 2012 value) for each metric.
# These columns are added to df_merged in place.
df_merged['income_change'] = df_merged['income_2035'].sub(df_merged['income_2012'])
df_merged['college_change'] = df_merged['college_2035'].sub(df_merged['college_2012'])
df_merged['rent_change'] = df_merged['rent_2035'].sub(df_merged['rent_2012'])

# Median of each change across the ZIP codes of every MSA, with the columns
# renamed so they read as MSA-level benchmarks rather than per-ZIP values
msa_medians = (
    df_merged
    .groupby('msa_name')[['income_change', 'rent_change', 'college_change']]
    .median()
    .reset_index()
    .rename(columns={
        'income_change': 'msa_income_median',
        'rent_change': 'msa_rent_median',
        'college_change': 'msa_college_median'
    })
)

# Put each MSA's benchmark next to every one of its ZIP codes so the two can
# be compared row by row
df_final = df_merged.merge(msa_medians, on='msa_name', how='left')

# A ZIP code counts as gentrified only when it strictly exceeds its MSA median
# on all three measures: income, college-educated share, and rent
df_final['gentrified'] = (
    df_final['income_change'].gt(df_final['msa_income_median'])
    & df_final['college_change'].gt(df_final['msa_college_median'])
    & df_final['rent_change'].gt(df_final['msa_rent_median'])
)

# Show the result, including the new 'gentrified' classification
df_final


Unnamed: 0,zip_code,msa_name,income_2012,college_2012,rent_2012,income_2035,college_2035,rent_2035,income_change,college_change,rent_change,msa_income_median,msa_rent_median,msa_college_median,gentrified
0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",15106.0,20.1,2869.0,21878,24.0,4602,6772.0,3.9,1733.0,6683.0,1681.5,3.45,True
1,603,"Aguadilla-Isabela-San Sebastian, PR MSA",15079.0,21.0,5897.0,21839,25.1,9459,6760.0,4.1,3562.0,6683.0,1681.5,3.45,True
2,610,"Aguadilla-Isabela-San Sebastian, PR MSA",16923.0,14.7,2055.0,24510,17.6,3296,7587.0,2.9,1241.0,6683.0,1681.5,3.45,False
3,631,"Aguadilla-Isabela-San Sebastian, PR MSA",11651.0,14.7,272.0,16874,17.6,436,5223.0,2.9,164.0,6683.0,1681.5,3.45,False
4,662,"Aguadilla-Isabela-San Sebastian, PR MSA",14735.0,17.2,5257.0,21341,20.6,8432,6606.0,3.4,3175.0,6683.0,1681.5,3.45,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15252,99674,"Anchorage, AK MSA",50962.0,20.9,132.0,73808,25.0,212,22846.0,4.1,80.0,31948.5,1043.5,5.15,False
15253,99676,"Anchorage, AK MSA",34576.0,22.3,88.0,50076,26.7,141,15500.0,4.4,53.0,31948.5,1043.5,5.15,False
15254,99683,"Anchorage, AK MSA",50441.0,10.4,43.0,73054,12.4,69,22613.0,2.0,26.0,31948.5,1043.5,5.15,False
15255,99688,"Anchorage, AK MSA",60030.0,18.1,58.0,86941,21.6,93,26911.0,3.5,35.0,31948.5,1043.5,5.15,False
