# Basic Classification

This notebook would contain basic data cleaning and classification


In [4]:
import pandas as pd

df_msa = pd.read_csv('data/msa-by-zip.csv')

df_msa = df_msa[df_msa['ZIP CODE'] >= 601]

df_msa['ZIP CODE'] = df_msa['ZIP CODE'].astype(str).str.zfill(5)

df_msa.columns = [col.lower().replace(" ", "_") for col in df_msa.columns]

df_subset = df_msa[['zip_code', 'msa_name']]

def clean_data(df, df_subset):
    df.columns = df.iloc[0]
    df = df[1:].reset_index(drop=True)
    df = df.drop([df.columns[0], df.columns[-1]], axis=1)

    df["zip_code"] = df["Geographic Area Name"].str.extract(r'ZCTA5 (\d{5})')

    df = df.drop(columns='Geographic Area Name')

    df.columns = [col.lower().replace(" ", "_") for col in df.columns]

    if 'estimate!!households!!median_income_(dollars)' in df.columns:

        df['col_numeric'] = pd.to_numeric(df['estimate!!households!!median_income_(dollars)'], errors='coerce')

        df = df.dropna(subset=['col_numeric'])

        df = df.drop(columns='col_numeric')

        df['estimate!!households!!median_income_(dollars)'] = df['estimate!!households!!median_income_(dollars)'].astype(float)
    
    elif 'households!!estimate!!median_income_(dollars)' in df.columns:
        df['col_numeric'] = pd.to_numeric(df['households!!estimate!!median_income_(dollars)'], errors='coerce')

        df = df.dropna(subset=['col_numeric'])

        df = df.drop(columns='col_numeric')

        df['households!!estimate!!median_income_(dollars)'] = df['households!!estimate!!median_income_(dollars)'].astype(float)

    df_merged = pd.merge(df, df_subset, on='zip_code', how='left')

    col = ['zip_code', 'msa_name']
    df_merged = df_merged[col + [c for c in df_merged.columns if c not in col]]

    df_merged = df_merged[~df_merged['msa_name'].str.contains('NONMETROPOLITAN', na=False)]

    df_merged = df_merged.dropna(subset=['msa_name'])

    return df_merged

df1 = pd.read_csv('data/2023_acs_income_data.csv')

df1 = clean_data(df1, df_subset)

df2 = pd.read_csv('data/2011_acs_income_data.csv')

df2 = clean_data(df2, df_subset)

df2.head()

# df.to_csv('2023_acs_income_data_cleaned.csv', index=False)

  df1 = pd.read_csv('data/2023_acs_income_data.csv')
  df2 = pd.read_csv('data/2011_acs_income_data.csv')


Unnamed: 0,zip_code,msa_name,households!!estimate!!total,households!!margin_of_error!!total,families!!estimate!!total,families!!margin_of_error!!total,married-couple_families!!estimate!!total,married-couple_families!!margin_of_error!!total,nonfamily_households!!estimate!!total,nonfamily_households!!margin_of_error!!total,...,nonfamily_households!!estimate!!percent_imputed!!family_income_in_the_past_12_months,nonfamily_households!!margin_of_error!!percent_imputed!!family_income_in_the_past_12_months,households!!estimate!!percent_imputed!!nonfamily_income_in_the_past_12_months,households!!margin_of_error!!percent_imputed!!nonfamily_income_in_the_past_12_months,families!!estimate!!percent_imputed!!nonfamily_income_in_the_past_12_months,families!!margin_of_error!!percent_imputed!!nonfamily_income_in_the_past_12_months,married-couple_families!!estimate!!percent_imputed!!nonfamily_income_in_the_past_12_months,married-couple_families!!margin_of_error!!percent_imputed!!nonfamily_income_in_the_past_12_months,nonfamily_households!!estimate!!percent_imputed!!nonfamily_income_in_the_past_12_months,nonfamily_households!!margin_of_error!!percent_imputed!!nonfamily_income_in_the_past_12_months
1,602,"Aguadilla-Isabela-San Sebastian, PR MSA",13500,297,10589,405,7355,381,2911,309,...,(X),(X),(X),(X),(X),(X),(X),(X),8.2,(X)
2,603,"Aguadilla-Isabela-San Sebastian, PR MSA",17958,439,12961,525,8035,436,4997,470,...,(X),(X),(X),(X),(X),(X),(X),(X),13.4,(X)
4,610,"Aguadilla-Isabela-San Sebastian, PR MSA",9288,286,7238,342,4670,290,2050,221,...,(X),(X),(X),(X),(X),(X),(X),(X),8.1,(X)
5,612,"San Juan-Caguas-Guaynabo, PR MSA",23569,528,17926,550,10788,535,5643,407,...,(X),(X),(X),(X),(X),(X),(X),(X),14.0,(X)
6,616,"San Juan-Caguas-Guaynabo, PR MSA",3551,262,2662,243,1577,200,889,177,...,(X),(X),(X),(X),(X),(X),(X),(X),8.5,(X)


In [9]:
def per_change(income1, income2):
    return float(income2/income1)-1


df_track = pd.DataFrame({'zip_code': [], 'msa_name': [], 'income': [], 'gentrified':[]})

unique_msa = set(df1['msa_name'])

for msa in unique_msa:
    print(msa)

    df1_specific = df1[df1['msa_name'] == msa]

    median_income1 = df1_specific['estimate!!households!!median_income_(dollars)'].median()

    df2_specific = df2[df2['msa_name'] == msa]

    median_income2 = df2_specific['households!!estimate!!median_income_(dollars)'].median()

    region_change = per_change(median_income2, median_income1)

    for zip in df1_specific['zip_code']:
        row1 = df1_specific[df1_specific['zip_code'] == zip].iloc[0]

        match = df2_specific[df2_specific['zip_code'] == zip]
        if not match.empty:
            row2 = match.iloc[0]

        zip_change = per_change(row2['households!!estimate!!median_income_(dollars)'], row1['estimate!!households!!median_income_(dollars)'])
        
        if zip_change > region_change:
            new_row = {'zip_code': row1['zip_code'], 'msa_name': row1['msa_name'], 'income': row1['estimate!!households!!median_income_(dollars)'], 'gentrified': True}
            df_track.loc[len(df_track)] = new_row
        else:
            new_row = {'zip_code': row1['zip_code'], 'msa_name': row1['msa_name'], 'income': row1['estimate!!households!!median_income_(dollars)'], 'gentrified': False}
            df_track.loc[len(df_track)] = new_row

df_track.to_csv('data/gentrified_data_Austin.csv', index=False)



St. Joseph, MO-KS MSA
Alexandria, LA MSA
St. Louis, MO-IL MSA
Chicago-Naperville-Joliet, IL-IN-WI MSA
Warner Robins, GA MSA
Miami-Fort Lauderdale-Pompano Beach, FL MSA
Yuba City, CA MSA
Oshkosh-Neenah, WI MSA
Abilene, TX MSA
Sheboygan, WI MSA
Louisville/Jefferson County, KY-IN MSA
Springfield, MA MSA
Charlottesville, VA MSA
Charleston, WV MSA
San Jose-Sunnyvale-Santa Clara, CA MSA
South Bend-Mishawaka, IN-MI MSA
Monroe, MI MSA
Jackson, MS MSA
Lebanon, PA MSA
Sebastian-Vero Beach, FL MSA
Sumter, SC MSA
Kingston, NY MSA
Barnstable Town, MA MSA
Knoxville, TN MSA
Columbia, MO MSA
Ponce, PR MSA
Las Cruces, NM MSA
Asheville, NC MSA
Naples-Marco Island, FL MSA
Williamsport, PA MSA
Bend, OR MSA
Racine, WI MSA
Glens Falls, NY MSA
Omaha-Council Bluffs, NE-IA MSA
Lafayette, LA MSA
Birmingham-Hoover, AL MSA
Eau Claire, WI MSA
Oxnard-Thousand Oaks-Ventura, CA MSA
Napa, CA MSA
Burlington-South Burlington, VT MSA
Goldsboro, NC MSA
Beaumont-Port Arthur, TX MSA
Bismarck, ND MSA
Kennewick-Richland-Pasco