# Basic Classification

This notebook would contain basic data cleaning and classification


In [None]:
import pandas as pd

# Load MSA-to-ZIP mapping data
df_msa = pd.read_csv('data/msa-by-zip.csv')

# Filter out invalid ZIP codes (less than 3 digits, e.g., P.O. Boxes, special-purpose ZIPs)
df_msa = df_msa[df_msa['ZIP CODE'] >= 601]

# Convert ZIP codes to 5-digit strings with leading zeros
df_msa['ZIP CODE'] = df_msa['ZIP CODE'].astype(str).str.zfill(5)

# Standardize column names to lowercase and replace spaces with underscores
df_msa.columns = [col.lower().replace(" ", "_") for col in df_msa.columns]

# Create a subset with only ZIP code and MSA name for merging later
df_subset = df_msa[['zip_code', 'msa_name']]

# Define data cleaning function
def clean_data(df, df_subset):
    """
    Cleans and processes a DataFrame containing ZIP Code Tabulation Area (ZCTA5) and income data, 
    merges it with a subset DataFrame, and returns the cleaned result along with the name of the median income column.

    Parameters:
    -----------
    df : pd.DataFrame
        Raw DataFrame containing income and geographic data. Assumes the first row is a header row, 
        and the first and last columns are to be dropped.
    
    df_subset : pd.DataFrame
        DataFrame containing a 'zip_code' column and an 'msa_name' column to be merged with `df`.
    
    Returns:
    --------
    tuple:
        - pd.DataFrame: Cleaned and merged DataFrame with relevant ZIP code and MSA (Metropolitan Statistical Area) data.
        - str: The column name used for median household income, depending on which format was present in the original data.
    """
    # Use the first row as the header, then reset index and drop the original header row
    df.columns = df.iloc[0]
    df = df[1:].reset_index(drop=True)

    # Drop the first and last columns (likely metadata or margin of error)
    df = df.drop([df.columns[0], df.columns[-1]], axis=1)

    # Extract 5-digit ZIP codes from the 'Geographic Area Name' column
    df["zip_code"] = df["Geographic Area Name"].str.extract(r'ZCTA5 (\d{5})')

    # Drop the 'Geographic Area Name' column after extracting the ZIP code
    df = df.drop(columns='Geographic Area Name')

    # Standardize column names: lowercase and underscores instead of spaces
    df.columns = [col.lower().replace(" ", "_") for col in df.columns]

    # Handle one possible format for the median income column
    if 'estimate!!households!!median_income_(dollars)' in df.columns:
        # Convert income to numeric, drop rows with non-numeric values
        df['col_numeric'] = pd.to_numeric(df['estimate!!households!!median_income_(dollars)'], errors='coerce')
        df = df.dropna(subset=['col_numeric']).drop(columns='col_numeric')
        df['estimate!!households!!median_income_(dollars)'] = df['estimate!!households!!median_income_(dollars)'].astype(float)
        median_key = 'estimate!!households!!median_income_(dollars)'

    # Handle alternate format of median income column
    elif 'households!!estimate!!median_income_(dollars)' in df.columns:
        df['col_numeric'] = pd.to_numeric(df['households!!estimate!!median_income_(dollars)'], errors='coerce')
        df = df.dropna(subset=['col_numeric']).drop(columns='col_numeric')
        df['households!!estimate!!median_income_(dollars)'] = df['households!!estimate!!median_income_(dollars)'].astype(float)
        median_key = 'households!!estimate!!median_income_(dollars)'

    # Merge with MSA name data on ZIP code
    df_merged = pd.merge(df, df_subset, on='zip_code', how='left')

    # Reorder columns: zip_code and msa_name first
    col = ['zip_code', 'msa_name']
    df_merged = df_merged[col + [c for c in df_merged.columns if c not in col]]

    # Remove rows that are marked as NONMETROPOLITAN or have no MSA name
    df_merged = df_merged[~df_merged['msa_name'].str.contains('NONMETROPOLITAN', na=False)]
    df_merged = df_merged.dropna(subset=['msa_name'])

    return df_merged, median_key

# Load 2011 ACS income data
df_base = pd.read_csv('data/2011_acs_income_data.csv')

# Clean and process the data using the function
df_base, med_key = clean_data(df_base, df_subset)

# Store the income column name for this specific year
key_2011 = med_key

# Preview the cleaned DataFrame
df_base.head()

# Optional: save cleaned data to CSV
# df.to_csv('2023_acs_income_data_cleaned.csv', index=False)

  df_base = pd.read_csv('data/2011_acs_income_data.csv')


Unnamed: 0,zip_code,msa_name,households!!estimate!!total,households!!margin_of_error!!total,families!!estimate!!total,families!!margin_of_error!!total,married-couple_families!!estimate!!total,married-couple_families!!margin_of_error!!total,nonfamily_households!!estimate!!total,nonfamily_households!!margin_of_error!!total,...,nonfamily_households!!estimate!!percent_imputed!!family_income_in_the_past_12_months,nonfamily_households!!margin_of_error!!percent_imputed!!family_income_in_the_past_12_months,households!!estimate!!percent_imputed!!nonfamily_income_in_the_past_12_months,households!!margin_of_error!!percent_imputed!!nonfamily_income_in_the_past_12_months,families!!estimate!!percent_imputed!!nonfamily_income_in_the_past_12_months,families!!margin_of_error!!percent_imputed!!nonfamily_income_in_the_past_12_months,married-couple_families!!estimate!!percent_imputed!!nonfamily_income_in_the_past_12_months,married-couple_families!!margin_of_error!!percent_imputed!!nonfamily_income_in_the_past_12_months,nonfamily_households!!estimate!!percent_imputed!!nonfamily_income_in_the_past_12_months,nonfamily_households!!margin_of_error!!percent_imputed!!nonfamily_income_in_the_past_12_months
1,602,"Aguadilla-Isabela-San Sebastian, PR MSA",13500,297,10589,405,7355,381,2911,309,...,(X),(X),(X),(X),(X),(X),(X),(X),8.2,(X)
2,603,"Aguadilla-Isabela-San Sebastian, PR MSA",17958,439,12961,525,8035,436,4997,470,...,(X),(X),(X),(X),(X),(X),(X),(X),13.4,(X)
4,610,"Aguadilla-Isabela-San Sebastian, PR MSA",9288,286,7238,342,4670,290,2050,221,...,(X),(X),(X),(X),(X),(X),(X),(X),8.1,(X)
5,612,"San Juan-Caguas-Guaynabo, PR MSA",23569,528,17926,550,10788,535,5643,407,...,(X),(X),(X),(X),(X),(X),(X),(X),14.0,(X)
6,616,"San Juan-Caguas-Guaynabo, PR MSA",3551,262,2662,243,1577,200,889,177,...,(X),(X),(X),(X),(X),(X),(X),(X),8.5,(X)


In [15]:
def clean_low_income(df, prices):
    """
    Converts income-related columns to numeric and calculates the total number of low-income households.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing income distribution across ZIP codes.
    
    prices : list of str
        List of income brackets (column suffixes) considered low income.

    Returns:
    --------
    df : pd.DataFrame
        Original DataFrame with an added 'per_low_income' column containing summed low-income household counts.
    """
    # Ensure total household column is numeric
    df['households!!estimate!!total'] = pd.to_numeric(df['households!!estimate!!total'], errors='coerce')

    # Create full column names for selected income brackets
    selected_cols = ['households!!estimate!!' + price for price in prices]

    # Convert selected columns to numeric
    for col in selected_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Sum selected low-income columns row-wise to get total low-income households
    df['per_low_income'] = df[selected_cols].sum(axis=1)

    return df

# Define the income brackets to classify as low income
prices = ['less_than_$10,000', '$10,000_to_$14,999', '$15,000_to_$24,999']

# Clean and calculate low-income sums for 2011 data
df_2011_cleaned = clean_low_income(df_base, prices)

# Get all unique MSA names
unique_msa = set(df_base['msa_name'])

# Prepare an empty DataFrame to collect at-risk classification results
track_low_income = pd.DataFrame(columns=['zip_code', 'msa_name', 'per_low_income', 'at_risk'])

# Loop over each MSA to classify ZIP codes
for msa in unique_msa:

    # Subset for current MSA
    df_specific = df_2011_cleaned[df_2011_cleaned['msa_name'] == msa]

    # Compute the median number of low-income households in this MSA
    median_per = df_specific['per_low_income'].median()

    # Classify each ZIP code in the MSA
    for _, row in df_specific.iterrows():
        at_risk = row['per_low_income'] > median_per
        new_row = {
            'zip_code': row['zip_code'],
            'msa_name': row['msa_name'],
            'per_low_income': row['per_low_income'],
            'at_risk': at_risk
        }
        # Append row to result DataFrame
        track_low_income.loc[len(track_low_income)] = new_row

# Save the at-risk classification to CSV
track_low_income.to_csv('data/low_income_classification.csv', index=False)

In [None]:
def per_change(income1, income2):
    """
    Calculates the percentage change from income1 to income2.

    Returns:
        A float representing the percent change (e.g., 0.1 = 10% increase).
    """
    return float(income2 / income1) - 1


# DataFrame to track income changes and gentrification status over time
df_track = pd.DataFrame({'zip_code': [], 'msa_name': [], 'income': [], 'gentrified': [], 'year': []})

# Dictionary to store the 2011 median income per MSA
med_income_2011 = {}

# Loop over each year from 2012 to 2023
for year in range(2012, 2024):
    print(f'starting year {year}')

    # Read ACS income data for the given year
    file_name = 'data/' + str(year) + '_acs_income_data.csv'
    df1 = pd.read_csv(file_name)

    # Clean and standardize the data
    df1, df_med_key = clean_data(df1, df_subset)

    # Loop over each MSA
    for msa in unique_msa:

        # Filter the current year's data for this MSA
        df1_specific = df1[df1['msa_name'] == msa]

        # Get the median income for the MSA in the current year
        median_income1 = df1_specific[df_med_key].median()

        # Get 2011 data for the same MSA
        df_base_specific = df_base[df_base['msa_name'] == msa]

        # Retrieve 2011 median income from cache, or compute it if not stored yet
        if msa in med_income_2011:
            median_income2 = med_income_2011[msa]
        else:
            median_income2 = df_base_specific[key_2011].median()
            med_income_2011[msa] = median_income2

        # Calculate regional (MSA-level) median income change from 2011 to current year
        region_change = per_change(median_income2, median_income1)

        # Loop through ZIP codes in this MSA for the current year
        for zip in df1_specific['zip_code']:
            # Get the current year's row for this ZIP
            row1 = df1_specific[df1_specific['zip_code'] == zip].iloc[0]

            # Find the matching 2011 row for this ZIP code
            match = df_base[df_base['zip_code'] == zip]
            if not match.empty:
                row2 = match.iloc[0]
            else:
                continue  # Skip ZIP codes not found in 2011 data

            # Calculate income change for this ZIP from 2011 to current year
            zip_change = per_change(row2[key_2011], row1[df_med_key])

            # Determine gentrification:
            # If ZIP income grew faster than the MSA median, mark it as gentrified
            if zip_change > region_change:
                new_row = {
                    'zip_code': row1['zip_code'],
                    'msa_name': row1['msa_name'],
                    'income': row1[df_med_key],
                    'gentrified': True,
                    'year': year
                }
            else:
                new_row = {
                    'zip_code': row1['zip_code'],
                    'msa_name': row1['msa_name'],
                    'income': row1[df_med_key],
                    'gentrified': False,
                    'year': year
                }

            # Append the row to the tracking DataFrame
            df_track.loc[len(df_track)] = new_row

# Export the final gentrification tracking dataset to CSV
df_track.to_csv('data/gentrified_data_2011-2023.csv', index=False)



starting year 2012


  df1 = pd.read_csv(file_name)


KeyboardInterrupt: 