In [15]:
import pandas as pd
from pathlib import Path
import numpy as np

In [16]:
def clean_up_income_data_0916(year, data_dir="Data/Income"):
    """Load one year's ZIP-level income workbook and reduce it to per-ZIP totals.

    The workbook ``<data_dir>/<yy>zp33ny.xls`` is read (``yy`` being the
    zero-padded last two digits of ``year``), the first three rows are
    skipped, and only the rows that carry a ZIP code but no income-bracket
    label are kept. The function treats those rows as the total for each
    ZIP code.

    Parameters
    ----------
    year : int
        Calendar year of the file to load, e.g. ``2013``.
    data_dir : str or pathlib.Path, optional
        Directory containing the ``.xls`` files. Defaults to ``"Data/Income"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``ZIP`` with the columns ``Nbr of Returns``,
        ``Adjusted Gross Income``, ``Average_Income`` (adjusted gross income
        divided by number of returns) and ``Year``.

    Raises
    ------
    KeyError
        If the ZIP, income-bracket or number-of-returns column is missing.
    IndexError
        If no column name contains ``'Adjusted gross income'``.
    """
    # Last two digits of the year, zero-padded (2009 -> "09", 2013 -> "13").
    year_suffix = f"{int(year) % 100:02d}"

    # Read the workbook, skipping the title and description rows.
    xls_path = Path(data_dir) / f"{year_suffix}zp33ny.xls"
    raw = pd.read_excel(xls_path, skiprows=3, index_col=None, na_values=['NA'])

    # Locate the relevant columns by position. The AGI header is matched by
    # substring; na=False keeps non-string headers from counting as a match.
    zip_pos = raw.columns.get_loc('ZIP\ncode [1]')
    bracket_pos = raw.columns.get_loc('Size of adjusted gross income')
    returns_pos = raw.columns.get_loc('Number of returns')
    agi_matches = raw.columns.str.contains(
        'Adjusted gross income', regex=False, na=False
    )
    agi_pos = np.where(agi_matches)[0][0]

    # Work on an independent copy and rename the columns for consistency.
    selected = raw.iloc[:, [zip_pos, bracket_pos, returns_pos, agi_pos]].copy()
    selected.columns = ['ZIP', 'Income Bracket', 'Nbr of Returns', 'Adjusted Gross Income']

    # Keep only the per-ZIP total rows: no income-bracket label, ZIP present.
    is_total_row = selected['Income Bracket'].isnull() & selected['ZIP'].notnull()

    # Dropping the bracket column and then any row with a missing value also
    # removes footer rows, which have no numeric data.
    totals = selected[is_total_row].drop(columns='Income Bracket').dropna()

    # Add average income and year columns, then index by ZIP.
    return totals.assign(
        Average_Income=lambda frame: frame['Adjusted Gross Income'] / frame['Nbr of Returns'],
        Year=year,
    ).set_index('ZIP')

In [17]:
# Run the cleaner for a single year; the returned frame is the cell's displayed output.
year = 2013
clean_up_income_data_0916(year=year)

Unnamed: 0_level_0,Nbr of Returns,Adjusted Gross Income,Average_Income,Year
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10001,13720.0,2094642.0,152.670700,2013
10002,43410.0,2002349.0,46.126446,2013
10003,29340.0,6135126.0,209.104499,2013
10004,2580.0,892142.0,345.791473,2013
10005,5660.0,5343796.0,944.133569,2013
...,...,...,...,...
14901,5420.0,188869.0,34.846679,2013
14903,3640.0,208539.0,57.290934,2013
14904,7180.0,249694.0,34.776323,2013
14905,4340.0,275019.0,63.368433,2013
