# Classification for College-Educated People

Source: ACS


In [14]:
import pandas as pd

# Load the ZIP-code-to-MSA crosswalk.
df_msa = pd.read_csv('data/msa-by-zip.csv')

# Keep only rows whose ZIP code is at least 601 (i.e. '00601' once zero-padded);
# smaller values are treated as invalid.
# NOTE(review): this comparison assumes 'ZIP CODE' was parsed as a number — confirm.
# The explicit .copy() detaches the filtered frame from the one read from disk,
# so the column assignment below is not a chained assignment on a slice.
df_msa = df_msa[df_msa['ZIP CODE'] >= 601].copy()

# Convert ZIP codes to 5-character strings with leading zeros
df_msa['ZIP CODE'] = df_msa['ZIP CODE'].astype(str).str.zfill(5)

# Standardize column names to lowercase and replace spaces with underscores
df_msa.columns = [col.lower().replace(" ", "_") for col in df_msa.columns]

# Create a subset with only ZIP code and MSA name for merging later
df_subset = df_msa[['zip_code', 'msa_name']]

# Define data cleaning function
def clean_data(df, df_subset):
    """
    Clean a raw education table keyed by ZCTA5 and attach MSA names by ZIP code.

    The caller's `df` is left untouched; all work happens on new frames.

    Parameters:
    -----------
    df : pd.DataFrame
        Raw DataFrame whose first row holds the descriptive column labels.
        Its first and last columns are dropped. After relabelling it must
        contain a 'Geographic Area Name' column (values like 'ZCTA5 00601')
        and a "Total!!Estimate!!Some college or associate's degree" column.

    df_subset : pd.DataFrame
        DataFrame containing a 'zip_code' column and an 'msa_name' column to be merged with `df`.

    Returns:
    --------
    pd.DataFrame
        One row per ZIP code that (a) has a numeric education value and
        (b) maps to an MSA whose name does not contain 'NONMETROPOLITAN'.
        Columns are 'zip_code', 'msa_name', then the remaining cleaned columns
        (lowercase, underscores instead of spaces). The education column is
        returned as float.
    """
    edu_col = "total!!estimate!!some_college_or_associate's_degree"

    # Promote the first row to the header. The labels are applied to the new,
    # re-indexed frame so that the caller's DataFrame keeps its own columns.
    header = df.iloc[0].tolist()
    df = df.iloc[1:].reset_index(drop=True)
    df.columns = header

    # Drop the first and last columns (by label, as before)
    df = df.drop([df.columns[0], df.columns[-1]], axis=1)

    # Extract 5-digit ZIP codes from the 'Geographic Area Name' column,
    # then discard the source column
    df["zip_code"] = df["Geographic Area Name"].str.extract(r'ZCTA5 (\d{5})', expand=False)
    df = df.drop(columns='Geographic Area Name')

    # Standardize column names: lowercase and underscores instead of spaces
    df.columns = [col.lower().replace(" ", "_") for col in df.columns]

    # Coerce the education column to float; entries that are not numeric
    # become NaN and their rows are removed.
    df[edu_col] = pd.to_numeric(df[edu_col], errors='coerce').astype(float)
    df = df.dropna(subset=[edu_col])

    # Merge with MSA name data on ZIP code
    df_merged = pd.merge(df, df_subset, on='zip_code', how='left')

    # Reorder columns: zip_code and msa_name first
    lead_cols = ['zip_code', 'msa_name']
    df_merged = df_merged[lead_cols + [c for c in df_merged.columns if c not in lead_cols]]

    # Remove rows that are marked as NONMETROPOLITAN or have no MSA name
    df_merged = df_merged[~df_merged['msa_name'].str.contains('NONMETROPOLITAN', na=False)]
    df_merged = df_merged.dropna(subset=['msa_name'])

    return df_merged

In [20]:
# Load the 2011 education table and reduce it to metropolitan ZIP codes
# with a numeric education value.
df = pd.read_csv('data/2011_acs_education_data.csv')

df = clean_data(df, df_subset)

edu_col = "total!!estimate!!some_college_or_associate's_degree"

# Median of the education column within each MSA, broadcast back onto every
# row of that MSA so it can be compared element-wise.
msa_median = df.groupby('msa_name')[edu_col].transform('median')

# A ZIP code is flagged 'at_risk' when its value is strictly below the median
# of its own MSA. Building the frame in one step keeps the rows in the input
# order (deterministic across runs, unlike iterating over a set of MSA names)
# and avoids appending row by row.
# NOTE(review): the output column is named 'per_college_educated'; whether the
# source column is a percentage or a count is not visible here — confirm.
output = pd.DataFrame({
    'zip_code': df['zip_code'],
    'msa_name': df['msa_name'],
    'per_college_educated': df[edu_col],
    'at_risk': df[edu_col] < msa_median,
}).reset_index(drop=True)

output.to_csv('data/college_educated_classification.csv', index=False)

  df = pd.read_csv('data/2011_acs_education_data.csv')
