# Basic Classification by Income

This notebook would contain basic data cleaning and classification

Source ACS

In [1]:
import pandas as pd

df_msa = pd.read_csv('../data/msa-by-zip.csv')

# Filter out invalid ZIP codes (less than 3 digits, e.g., P.O. Boxes, special-purpose ZIPs)
df_msa = df_msa[df_msa['ZIP CODE'] >= 601]

# Convert ZIP codes to 5-digit strings with leading zeros
df_msa['ZIP CODE'] = df_msa['ZIP CODE'].astype(str).str.zfill(5)

# Standardize column names to lowercase and replace spaces with underscores
df_msa.columns = [col.lower().replace(" ", "_") for col in df_msa.columns]

# Create a subset with only ZIP code and MSA name for merging later
df_subset = df_msa[['zip_code', 'msa_name']]

# Define data cleaning function
def clean_data(df, df_subset):
    """
    Cleans and processes a DataFrame containing ZIP Code Tabulation Area (ZCTA5) and income data, 
    merges it with a subset DataFrame, and returns the cleaned result along with the name of the median income column.

    Parameters:
    -----------
    df : pd.DataFrame
        Raw DataFrame containing income and geographic data. Assumes the first row is a header row, 
        and the first and last columns are to be dropped.
    
    df_subset : pd.DataFrame
        DataFrame containing a 'zip_code' column and an 'msa_name' column to be merged with `df`.
    
    Returns:
    --------
    tuple:
        - pd.DataFrame: Cleaned and merged DataFrame with relevant ZIP code and MSA (Metropolitan Statistical Area) data.
        - str: The column name used for median household income, depending on which format was present in the original data.
    """
    # Use the first row as the header, then reset index and drop the original header row
    df.columns = df.iloc[0]
    df = df[1:].reset_index(drop=True)

    # Drop the first and last columns (likely metadata or margin of error)
    df = df.drop([df.columns[0], df.columns[-1]], axis=1)

    # Extract 5-digit ZIP codes from the 'Geographic Area Name' column
    df["zip_code"] = df["Geographic Area Name"].str.extract(r'ZCTA5 (\d{5})')

    # Drop the 'Geographic Area Name' column after extracting the ZIP code
    df = df.drop(columns='Geographic Area Name')

    # Standardize column names: lowercase and underscores instead of spaces
    df.columns = [col.lower().replace(" ", "_") for col in df.columns]

    # Handle one possible format for the median income column
    if 'estimate!!households!!median_income_(dollars)' in df.columns:
        # Convert income to numeric, drop rows with non-numeric values
        df['col_numeric'] = pd.to_numeric(df['estimate!!households!!median_income_(dollars)'], errors='coerce')
        df = df.dropna(subset=['col_numeric']).drop(columns='col_numeric')
        df['estimate!!households!!median_income_(dollars)'] = df['estimate!!households!!median_income_(dollars)'].astype(float)
        median_key = 'estimate!!households!!median_income_(dollars)'

    # Handle alternate format of median income column
    elif 'households!!estimate!!median_income_(dollars)' in df.columns:
        df['col_numeric'] = pd.to_numeric(df['households!!estimate!!median_income_(dollars)'], errors='coerce')
        df = df.dropna(subset=['col_numeric']).drop(columns='col_numeric')
        df['households!!estimate!!median_income_(dollars)'] = df['households!!estimate!!median_income_(dollars)'].astype(float)
        median_key = 'households!!estimate!!median_income_(dollars)'

    # Merge with MSA name data on ZIP code
    df_merged = pd.merge(df, df_subset, on='zip_code', how='left')

    # Reorder columns: zip_code and msa_name first
    col = ['zip_code', 'msa_name']
    df_merged = df_merged[col + [c for c in df_merged.columns if c not in col]]

    # Remove rows that are marked as NONMETROPOLITAN or have no MSA name
    df_merged = df_merged[~df_merged['msa_name'].str.contains('NONMETROPOLITAN', na=False)]
    df_merged = df_merged.dropna(subset=['msa_name'])

    return df_merged, median_key

df_base = pd.read_csv('../data/2011_acs_income_data.csv')

# Clean and process the data using the function
df_base, med_key = clean_data(df_base, df_subset)

# Store the income column name for this specific year
key_2011 = med_key

# Preview the cleaned DataFrame
df_base.head()

# Optional: save cleaned data to CSV
# df.to_csv('2023_acs_income_data_cleaned.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../data/msa-by-zip.csv'

In [None]:
cols = ['households!!estimate!!less_than_$10,000', 'households!!estimate!!$10,000_to_$14,999', 'households!!estimate!!$15,000_to_$24,999']
for c in cols:
    df_base[c] = pd.to_numeric(df_base[c], errors='coerce')
    df_base = df_base.dropna(subset=[c])

df_selected_columns = df_base[cols]

row_sums = df_selected_columns.sum(axis=1)

df_base['per_low_income'] = row_sums

unique_msa = df_base['msa_name'].unique()

output = []

for msa in unique_msa:
    df_specific_msa = df_base[df_base['msa_name'] == msa].set_index('zip_code')

    median_low_income = df_base['per_low_income'].median()

    for zip in list(df_specific_msa.index):
        per_low_income = df_specific_msa.loc[zip, 'per_low_income']

        output.append({'zip_code': zip, 'msa_name': msa, 'per_low_income': per_low_income, 'at_risk_income': per_low_income > median_low_income})


df_output = pd.DataFrame.from_records(output)
df_output.to_csv('../data/low_income_classification_risk.csv', index=False)


In [None]:
import pandas as pd

def per_change(income1, income2):
    """Calculates percent change from income1 to income2 (e.g., 0.1 = 10% increase)."""
    return (income2 / income1) - 1 if income1 != 0 else 0

# Prepare base 2011 data
df_base = df_base.set_index('zip_code')
med_income_2011 = df_base.groupby('msa_name')[key_2011].median().to_dict()

# Placeholder for results
records = []

# Loop from 2012 to 2023
for year in range(2012, 2024):
    print(f"Processing year {year}...")
    
    df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')
    df_current, df_med_key = clean_data(df_current, df_subset)
    df_current = df_current.set_index('zip_code')
    
    # Get unique MSAs
    msas = df_current['msa_name'].unique()

    for msa in msas:
        df_current_msa = df_current[df_current['msa_name'] == msa]
        median_income_current = df_current_msa[df_med_key].median()
        median_income_base = med_income_2011.get(msa, None)

        # Skip MSA if 2011 data not available
        if median_income_base is None:
            continue

        region_change = per_change(median_income_base, median_income_current)

        # Join current and base data on ZIP for this MSA
        df_base_msa = df_base[df_base['msa_name'] == msa]
        shared_zips = df_current_msa.index.intersection(df_base_msa.index)

        for zip_code in shared_zips:
            income_current = df_current_msa.loc[zip_code, df_med_key]
            income_base = df_base_msa.loc[zip_code, key_2011]

            zip_change = per_change(income_base, income_current)
            gentrified = zip_change > region_change

            records.append({
                'zip_code': zip_code,
                'msa_name': msa,
                'income': income_current,
                'gentrified': gentrified,
                'year': year
            })

# Final DataFrame and export
df_track = pd.DataFrame.from_records(records)
df_track.to_csv('../data/low_income_classification_by_year.csv', index=False)


Processing year 2012...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2013...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2014...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2015...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2016...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2017...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2018...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2019...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2020...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2021...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2022...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')


Processing year 2023...


  df_current = pd.read_csv(f'../data/{year}_acs_income_data.csv')
