# Classification of College-Educated Populations

Source: American Community Survey (ACS)


In [None]:
import pandas as pd

# ZIP-to-MSA crosswalk: one row per ZIP code with the MSA it belongs to.
df_msa = pd.read_csv('data/msa-by-zip.csv')

# Keep only rows whose numeric ZIP value is at least 601; anything below
# that threshold is discarded before the codes are turned into strings.
zip_is_valid = df_msa['ZIP CODE'] >= 601
df_msa = df_msa[zip_is_valid]

# Render every ZIP as text, left-padded with zeros to five characters.
df_msa['ZIP CODE'] = df_msa['ZIP CODE'].astype(str).str.zfill(5)

# Normalise the headers: lowercase, with underscores in place of spaces.
df_msa = df_msa.rename(columns=lambda label: label.lower().replace(" ", "_"))

# Two-column lookup (ZIP -> MSA name) used for the merges further down.
df_subset = df_msa.loc[:, ['zip_code', 'msa_name']]

keywords = ["some_college_or_associate's_degree"]


def name_contains_all(col_name, keywords):
    """
    Report whether a column name contains every keyword, ignoring case.

    Parameters
    ----------
    col_name : str
        Column name to inspect.
    keywords : iterable of str
        Substrings that must all occur in `col_name`.

    Returns
    -------
    bool
        True when each keyword is a case-insensitive substring of
        `col_name` (also True when `keywords` is empty), otherwise False.
    """
    for needle in keywords:
        # Stop at the first keyword that is missing from the name.
        if needle.lower() not in col_name.lower():
            return False
    return True

def clean_data(df, df_subset):
    """
    Clean a raw ACS educational-attainment table and attach MSA names.

    The function:
    - Promotes the first data row to column headers.
    - Drops the first and last columns and the 'Geographic Area Name'
      column after extracting the 5-digit ZIP code from it.
    - Standardizes column names (lowercase, spaces replaced by underscores).
    - Computes `per_college`, the percentage of the population with a
      bachelor's degree or higher, for each supported column layout.
    - Left-merges MSA names on `zip_code` and removes rows with no MSA or
      whose MSA name contains 'NONMETROPOLITAN'.

    The input frame is not modified; all work happens on a new frame.

    Parameters
    ----------
    df : pd.DataFrame
        Raw table whose first row holds the descriptive column headers.
        One of those headers must be 'Geographic Area Name', with values
        containing 'ZCTA5 <5 digits>'.

    df_subset : pd.DataFrame
        Lookup with the columns 'zip_code' and 'msa_name'.

    Returns
    -------
    pd.DataFrame
        Cleaned frame with 'zip_code' and 'msa_name' as the first two
        columns, followed by the standardized source columns and, when a
        known layout was found, `per_college`. For the age-group layouts
        the intermediate count columns ('18_24_bachelors',
        'over_25_bachelors', 'over_25_masters', '18_24_pop',
        'over_25_pop') are included as well.

    Warns
    -----
    UserWarning
        If none of the known column layouts is present; `per_college` is
        then absent from the result.
    """
    import warnings  # local import keeps this function self-contained

    # Build the working frame from a fresh copy so that renaming columns
    # never touches the DataFrame owned by the caller.
    header = df.iloc[0]
    data = df.iloc[1:].reset_index(drop=True)
    data.columns = header

    # Drop first and last columns (by label, as identified by position)
    data = data.drop([data.columns[0], data.columns[-1]], axis=1)

    # Extract 5-digit ZIP code; expand=False yields a Series, not a frame
    data["zip_code"] = data["Geographic Area Name"].str.extract(
        r'ZCTA5 (\d{5})', expand=False
    )
    data = data.drop(columns='Geographic Area Name')

    # Standardize column names: lowercase and replace spaces with underscores
    data.columns = [col.lower().replace(" ", "_") for col in data.columns]

    if 'total!!estimate!!' + _SOME_COLLEGE in data.columns:
        # Layout with a ready-made percentage column; rows whose value is
        # not numeric are removed.
        percent = pd.to_numeric(
            data["total!!estimate!!percent_bachelor's_degree_or_higher"],
            errors='coerce',
        )
        data['per_college'] = percent.astype(float)
        data = data.dropna(subset=['per_college'])
    else:
        # Layouts that report counts per age group; they differ only in
        # the prefix in front of the column labels.
        for prefix in _AGE_GROUP_PREFIXES:
            marker = prefix + 'population_18_to_24_years!!' + _SOME_COLLEGE
            if marker in data.columns:
                data = _add_age_group_per_college(data, prefix)
                break
        else:
            warnings.warn(
                "clean_data: no known educational-attainment column layout "
                "found; 'per_college' was not computed",
                stacklevel=2,
            )

    # Merge education data with MSA information using ZIP codes
    merged = pd.merge(data, df_subset, on='zip_code', how='left')

    # Reorder columns for clarity: zip_code and msa_name first
    leading = ['zip_code', 'msa_name']
    merged = merged[leading + [c for c in merged.columns if c not in leading]]

    # Remove rows that have no MSA or are labeled as NONMETROPOLITAN
    merged = merged[~merged['msa_name'].str.contains('NONMETROPOLITAN', na=False)]
    merged = merged.dropna(subset=['msa_name'])

    return merged


# Column-label fragment used to recognise which layout a table uses.
_SOME_COLLEGE = "some_college_or_associate's_degree"

# Label prefixes of the age-group layouts, in the order they are tried.
_AGE_GROUP_PREFIXES = (
    'estimate!!total!!age_by_educational_attainment!!',
    'total!!estimate!!',
    'estimate!!total!!',
)


def _add_age_group_per_college(data, prefix):
    """Add numeric count columns and `per_college` for an age-group layout."""
    young = prefix + 'population_18_to_24_years'
    adult = prefix + 'population_25_years_and_over'
    sources = {
        '18_24_bachelors': young + "!!bachelor's_degree_or_higher",
        'over_25_bachelors': adult + "!!bachelor's_degree",
        'over_25_masters': adult + '!!graduate_or_professional_degree',
        '18_24_pop': young,
        'over_25_pop': adult,
    }
    for new_name, source in sources.items():
        data[new_name] = pd.to_numeric(data[source], errors='coerce')

    graduates = (
        data['18_24_bachelors'] + data['over_25_bachelors'] + data['over_25_masters']
    )
    population = data['18_24_pop'] + data['over_25_pop']
    data['per_college'] = (graduates / population) * 100
    return data


In [None]:
# Load base year (2011) ACS education data
df_base = pd.read_csv('data/2011_acs_education_data.csv')

# Clean and process the raw data, merging ZIP codes with MSA names and computing education levels
df_base = clean_data(df_base, df_subset)

# Set of MSAs present in the cleaned base-year data (reused by later cells)
unique_msa = set(df_base['msa_name'])

# Median percentage of college-educated individuals per MSA, keyed by MSA
# name. Computed in one grouped pass instead of one filter per MSA.
median_msa_college_base = (
    df_base.groupby('msa_name')['per_college'].median().to_dict()
)

# Median of the MSA each ZIP belongs to, aligned row-by-row with df_base
msa_median = df_base['msa_name'].map(median_msa_college_base)

# One row per ZIP. A ZIP is "at risk" when its share is strictly below the
# median of its MSA; a missing share compares as False.
output = pd.DataFrame({
    'zip_code': df_base['zip_code'],
    'msa_name': df_base['msa_name'],
    'per_college_educated': df_base['per_college'],
    'at_risk': df_base['per_college'] < msa_median,
})

# Group rows by MSA in a reproducible order (alphabetical by MSA name, the
# original row order kept within each MSA) so reruns give the same file.
output = output.sort_values('msa_name', kind='stable').reset_index(drop=True)

# Export the final output DataFrame to a CSV file
output.to_csv('data/college_educated_classification_base_year.csv', index=False)


  df_base = pd.read_csv('data/2011_acs_education_data.csv')


In [None]:
# Base-year share per ZIP, built once. When a ZIP occurs more than once in
# the base data the first occurrence is used.
base_by_zip = (
    df_base.drop_duplicates('zip_code').set_index('zip_code')['per_college']
)

# Per-year result frames; concatenated once at the end instead of appending
# row by row.
yearly_results = []

# Iterate over each year from 2012 to 2023 (inclusive)
for year in range(2012, 2024):
    print(year)  # Log current year being processed

    # Construct file path for the current year's ACS education data
    file_name = 'data/' + str(year) + '_acs_education_data.csv'

    # Read and clean the data for the current year
    df = pd.read_csv(file_name)
    df = clean_data(df, df_subset)

    # Keep only MSAs of interest (those present in the 2011 base data)
    current = df[df['msa_name'].isin(unique_msa)]

    # Current-year median per MSA, taken over every ZIP of that MSA in this
    # year's data (including ZIPs without a base-year record), broadcast
    # back onto the rows.
    median_now = current.groupby('msa_name')['per_college'].transform('median')

    # 2011 (base year) median of the MSA each row belongs to
    median_then = current['msa_name'].map(median_msa_college_base)

    # Change in MSA-level education attainment since 2011
    region_change = median_now - median_then

    # Change of each ZIP relative to its own base-year value
    zip_change = current['per_college'] - current['zip_code'].map(base_by_zip)

    # A ZIP is "gentrified" if its college-educated share increased by more
    # than the overall MSA's increase; missing values compare as False.
    # NOTE: each row contributes its own values, which assumes a ZIP does
    # not appear twice within one MSA with different values.
    year_result = pd.DataFrame({
        'zip_code': current['zip_code'],
        'msa_name': current['msa_name'],
        'per_college_educated': current['per_college'],
        'gentrified': zip_change > region_change,
        'year': year,
    })

    # Skip ZIPs with no matching 2011 record, and group rows by MSA in a
    # reproducible order (alphabetical, original order kept within an MSA).
    has_base_record = current['zip_code'].isin(base_by_zip.index)
    year_result = year_result[has_base_record].sort_values('msa_name', kind='stable')

    yearly_results.append(year_result)

# Combine all years, in processing order, into the final table
output = pd.concat(yearly_results, ignore_index=True)

# After all years are processed, save the full results to CSV
output.to_csv('data/college_educated_classification_gentrified_overtime.csv', index=False)


2012


  df = pd.read_csv(file_name)


2013


  df = pd.read_csv(file_name)


2014


  df = pd.read_csv(file_name)


2015


  df = pd.read_csv(file_name)


2016


  df = pd.read_csv(file_name)


2017


  df = pd.read_csv(file_name)


2018


  df = pd.read_csv(file_name)


2019


  df = pd.read_csv(file_name)


2020


  df = pd.read_csv(file_name)


2021


  df = pd.read_csv(file_name)


2022


  df = pd.read_csv(file_name)


2023


  df = pd.read_csv(file_name)
