# Gentrification Classification

This notebook combines the individual risk and gentrification classification categories into a single dataset.

In [None]:
import pandas as pd

# Load the risk classifications and the demographic data.
# Each DataFrame is named after the file it actually holds: the college file
# supplies 'per_college_educated'/'at_risk_college' and the low-income file
# supplies 'per_low_income'/'at_risk_income' (see the merged output below).
df_college_risk = pd.read_csv('../data/college_educated_classification.csv')  # College-education risk classification
df_income_risk = pd.read_csv('../data/low_income_risk.csv')                   # Low-income risk classification
df_minority = pd.read_csv('../data/demo_data_2011_cleaned.csv')               # Demographic data including minority percentages

# Flag ZIP codes whose percent non-white population exceeds the regional non-white value.
# Rows with a missing value on either side compare as False.
df_minority['at_risk_min'] = df_minority['percent_nonwhite'] > df_minority['region_nonwhite']

# 'year' is not a merge key for the risk tables, so drop it before merging
df_minority = df_minority.drop(columns='year')

# Inner-join the college and income risk tables on ZIP code and MSA, keeping only
# ZIP/MSA pairs present in both. College is the left frame so its columns come first.
merged_df_risk = df_college_risk.merge(df_income_risk, on=['zip_code', 'msa_name'], how='inner')

# Inner-join the minority indicator on the same keys so each remaining row
# carries all three risk indicators
merged_df_risk = merged_df_risk.merge(df_minority, on=['zip_code', 'msa_name'], how='inner')

# Display the merged risk table
merged_df_risk


Unnamed: 0,zip_code,msa_name,per_college_educated,at_risk_college,per_low_income,at_risk_income,percent_nonwhite,region_nonwhite,at_risk_min
0,53001,"Sheboygan, WI MSA",17.0,True,14.8,True,1.0,2.6,False
1,53011,"Sheboygan, WI MSA",16.4,True,16.0,True,2.9,2.6,True
2,53013,"Sheboygan, WI MSA",27.0,False,13.5,False,2.8,2.6,True
3,53020,"Sheboygan, WI MSA",21.8,False,9.5,False,1.4,2.6,False
4,53023,"Sheboygan, WI MSA",11.0,True,13.4,False,23.7,2.6,True
...,...,...,...,...,...,...,...,...,...
17028,31038,"Atlanta-Sandy Springs-Marietta, GA MSA",15.3,True,31.0,True,20.5,31.0,False
17029,31064,"Atlanta-Sandy Springs-Marietta, GA MSA",18.3,True,33.5,True,30.9,31.0,False
17030,31085,"Atlanta-Sandy Springs-Marietta, GA MSA",17.8,True,7.7,False,20.6,31.0,False
17031,31816,"Atlanta-Sandy Springs-Marietta, GA MSA",13.1,True,41.6,True,40.9,31.0,True


In [None]:
# A ZIP code is at risk overall only when the college, income and minority
# risk flags are all True. Combine the first two, then fold in the third.
college_and_income_risk = merged_df_risk['at_risk_college'] & merged_df_risk['at_risk_income']
merged_df_risk['at_risk_overall'] = college_and_income_risk & merged_df_risk['at_risk_min']

merged_df_risk

Unnamed: 0,zip_code,msa_name,per_college_educated,at_risk_college,per_low_income,at_risk_income,percent_nonwhite,region_nonwhite,at_risk_min,at_risk_overall
0,53001,"Sheboygan, WI MSA",17.0,True,14.8,True,1.0,2.6,False,False
1,53011,"Sheboygan, WI MSA",16.4,True,16.0,True,2.9,2.6,True,True
2,53013,"Sheboygan, WI MSA",27.0,False,13.5,False,2.8,2.6,True,False
3,53020,"Sheboygan, WI MSA",21.8,False,9.5,False,1.4,2.6,False,False
4,53023,"Sheboygan, WI MSA",11.0,True,13.4,False,23.7,2.6,True,False
...,...,...,...,...,...,...,...,...,...,...
17028,31038,"Atlanta-Sandy Springs-Marietta, GA MSA",15.3,True,31.0,True,20.5,31.0,False,False
17029,31064,"Atlanta-Sandy Springs-Marietta, GA MSA",18.3,True,33.5,True,30.9,31.0,False,False
17030,31085,"Atlanta-Sandy Springs-Marietta, GA MSA",17.8,True,7.7,False,20.6,31.0,False,False
17031,31816,"Atlanta-Sandy Springs-Marietta, GA MSA",13.1,True,41.6,True,40.9,31.0,True,True


In [None]:
# Per-year gentrification classifications for low income and for the
# college-educated population
df_income_gent = pd.read_csv('../data/low_income_classification_by_year.csv')
df_college_gent = pd.read_csv('../data/college_educated_classification_gentrified_overtime.csv')

# Rent data (2012-2023 according to the file name)
df_rent = pd.read_csv('../data/rent_2012_2023.csv')

# Discard the leading column of the rent table (presumably a saved index column)
# and keep every column after it
df_rent = df_rent.drop(columns=df_rent.columns[0])

# Inspect the rent table after the leading column has been removed
df_rent


Unnamed: 0,median_contract_rent,zip_code,msa_name,year,gentrified_rent
0,3076.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2021,False
1,3212.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2019,False
2,2903.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2014,False
3,3257.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2018,False
4,3147.0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",2022,False
...,...,...,...,...,...
212774,0.0,99790,"Fairbanks, AK MSA",2017,False
212775,0.0,99790,"Fairbanks, AK MSA",2021,True
212776,0.0,99790,"Fairbanks, AK MSA",2018,False
212777,0.0,99790,"Fairbanks, AK MSA",2022,True


In [None]:
# One observation is identified by ZIP code, MSA and year
gent_keys = ['zip_code', 'msa_name', 'year']

# Start from the income indicators and inner-join the college and rent
# indicators in turn, so only ZIP/MSA/year combinations present in all
# three tables remain
merged_df_gent = df_income_gent
for indicator_df in (df_college_gent, df_rent):
    merged_df_gent = merged_df_gent.merge(indicator_df, how='inner', on=gent_keys)

# Overall flag: True only when the income, college and rent indicators are all True
income_and_college = merged_df_gent['gentrified_income'] & merged_df_gent['gentrified_college']
merged_df_gent['gentrified'] = income_and_college & merged_df_gent['gentrified_rent']

# Show the combined indicators together with the overall classification
merged_df_gent


Unnamed: 0,zip_code,msa_name,income,gentrified_income,year,per_college_educated,gentrified_college,median_contract_rent,gentrified_rent,gentrified
0,602,"Aguadilla-Isabela-San Sebastian, PR MSA",15106.0,False,2012,20.100000,True,2869.0,True,False
1,603,"Aguadilla-Isabela-San Sebastian, PR MSA",15079.0,False,2012,21.000000,True,5897.0,False,False
2,610,"Aguadilla-Isabela-San Sebastian, PR MSA",16923.0,False,2012,14.700000,False,2055.0,False,False
3,631,"Aguadilla-Isabela-San Sebastian, PR MSA",11651.0,True,2012,14.700000,True,272.0,False,False
4,662,"Aguadilla-Isabela-San Sebastian, PR MSA",14735.0,True,2012,17.200000,True,5257.0,False,False
...,...,...,...,...,...,...,...,...,...,...
199405,99705,"Fairbanks, AK MSA",96966.0,True,2023,23.235542,True,1702.0,True,True
199406,99709,"Fairbanks, AK MSA",82991.0,True,2023,39.784946,True,4452.0,True,True
199407,99712,"Fairbanks, AK MSA",103494.0,True,2023,34.573561,True,683.0,True,True
199408,99714,"Fairbanks, AK MSA",71042.0,False,2023,24.044734,False,38.0,True,False


In [None]:
# Attach the risk flags to every ZIP/MSA/year gentrification record.
# Left join: every row of merged_df_gent is kept, and rows with no match in
# merged_df_risk get NaN in all of the risk columns.
df_merged_total = merged_df_gent.merge(merged_df_risk, on=['zip_code', 'msa_name'], how='left')

# A record counts as gentrified only if it met the income/college/rent criteria
# AND its ZIP code was flagged at risk overall.
# .eq(True) turns the True/False/NaN risk column into a plain boolean mask, so
# unmatched rows (NaN) are explicitly treated as "not at risk" rather than
# relying on how `&` handles missing values.
df_merged_total['gentrified'] = (
    df_merged_total['gentrified'] & df_merged_total['at_risk_overall'].eq(True)
)

# Keep only the records classified as gentrified ('gentrified' is a boolean mask)
df_gentrified = df_merged_total[df_merged_total['gentrified']]

# For each ZIP code, find the earliest year it was classified as gentrified.
# NOTE(review): this groups and merges on 'zip_code' alone, while every other
# join in this notebook uses 'zip_code' + 'msa_name'. If a ZIP code can appear
# under more than one MSA, its first year is shared across them - confirm.
first_years = (
    df_gentrified.groupby('zip_code')['year']
    .min()
    .reset_index()
    .rename(columns={'year': 'first_gentrified_year'})
)

# Add the first gentrification year to every record of that ZIP code;
# ZIP codes that were never classified as gentrified get NaN
df = df_merged_total.merge(first_years, on='zip_code', how='left')

# Save the complete dataframe with classifications and first gentrification years
df.to_csv('../data/classification_data_raw.csv', index=False)


In [None]:
# Gentrified ZIP codes: keep the records whose year equals the ZIP code's
# first gentrified year (rows with no first year never match)
is_first_gentrified_year = df['year'].eq(df['first_gentrified_year'])
df_gent = df.loc[is_first_gentrified_year]

# Non-gentrified ZIP codes: no first gentrified year was recorded
never_gentrified = df['first_gentrified_year'].isna()
df_nongent = df.loc[never_gentrified]

# Keep a single record per non-gentrified ZIP code: order by year so the
# earliest record comes first, then retain the first occurrence of each ZIP
df_nongent = df_nongent.sort_values('year')
df_nongent = df_nongent.drop_duplicates(subset='zip_code', keep='first')

# Column check: both subsets should expose the same columns before stacking
for subset_df in (df_gent, df_nongent):
    print(subset_df.columns)

# Quick summary: number of gentrified vs. non-gentrified records
print(f"{len(df_gent)} {len(df_nongent)}")

# Stack the two subsets into the final dataset with a fresh 0..n-1 index
df_final = pd.concat((df_gent, df_nongent), ignore_index=True)

# Write the final classification data for downstream analysis or modeling
df_final.to_csv('../data/classification_data.csv', index=False)


Index(['zip_code', 'msa_name', 'income', 'gentrified_income', 'year',
       'per_college_educated_x', 'gentrified_college', 'median_contract_rent',
       'gentrified_rent', 'gentrified', 'per_college_educated_y',
       'at_risk_college', 'per_low_income', 'at_risk_income',
       'percent_nonwhite', 'region_nonwhite', 'at_risk_min', 'at_risk_overall',
       'first_gentrified_year'],
      dtype='object')
Index(['zip_code', 'msa_name', 'income', 'gentrified_income', 'year',
       'per_college_educated_x', 'gentrified_college', 'median_contract_rent',
       'gentrified_rent', 'gentrified', 'per_college_educated_y',
       'at_risk_college', 'per_low_income', 'at_risk_income',
       'percent_nonwhite', 'region_nonwhite', 'at_risk_min', 'at_risk_overall',
       'first_gentrified_year'],
      dtype='object')
1448 15567


In [None]:
# Build a lookup of ZIP code, MSA name and first gentrified year:
#   1. keep only those three columns and drop exact duplicate rows
#   2. add a constant 'start_year' of 2011 (possibly the baseline year - confirm)
#   3. convert ZIP codes to strings left-padded with zeros to 5 characters
df_first_gentrified = (
    df_final[['zip_code', 'msa_name', 'first_gentrified_year']]
    .drop_duplicates()
    .assign(
        start_year=2011,
        zip_code=lambda frame: frame['zip_code'].astype(str).str.zfill(5),
    )
)

# Show the resulting table
df_first_gentrified


Unnamed: 0,zip_code,msa_name,first_gentrified_year,start_year
0,01453,"Worcester, MA MSA",2012.0,2011
1,01605,"Worcester, MA MSA",2012.0,2011
2,01222,"Pittsfield, MA MSA",2012.0,2011
3,01255,"Pittsfield, MA MSA",2012.0,2011
4,01702,"Boston-Cambridge-Quincy, MA-NH MSA",2012.0,2011
...,...,...,...,...
17010,15625,"Pittsburgh, PA MSA",,2011
17011,36568,"Mobile, AL MSA",,2011
17012,58202,"Grand Forks, ND-MN MSA",,2011
17013,15691,"Pittsburgh, PA MSA",,2011
