In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

In [33]:
# Tax year to load. The input file name embeds the year as a two-digit
# suffix, so the suffix is derived here once and reused when building the path.
year = 2009

# Zero-padded last two digits of the year (2009 -> "09", 2015 -> "15").
# Taking the year modulo 100 keeps the suffix at exactly two digits even for
# years before 2000, where subtracting 2000 would yield a negative number.
year_str = f"{year % 100:02d}"

year_str

'09'

In [34]:
# Load the income workbook for the selected year.
# The first three rows of the sheet (title / description) are skipped, and the
# literal text 'NA' is treated as a missing value.
xls_path = Path("Data") / "Income" / f"{year_str}zp33ny.xls"
df_xls = pd.read_excel(
    xls_path,
    skiprows=3,
    index_col=None,
    na_values=['NA'],
)
df_xls.head()

Unnamed: 0,ZIP\ncode [1],Size of adjusted gross income,Number of returns,Number of joint returns,Number with paid preparer's signature,Number of exemptions,Number of dependents,Adjusted gross income (AGI),Salaries and wages in AGI,Unnamed: 9,...,Alternative minimum tax,Unnamed: 62,Income tax [6],Unnamed: 64,Total tax liability [7],Unnamed: 66,Tax due at time of filing [8],Unnamed: 68,Overpayments refunded [9],Unnamed: 70
0,,,,,,,,,Number of returns,Amount,...,Number of returns,Amount,Number of returns,Amount,Number of returns,Amount,Number of returns,Amount,Number of returns,Amount
1,,,-1.0,-2.0,-3.0,-4.0,-5.0,-6.0,-7,-8,...,-60,-61,-62,-63,-64,-65,-66,-67,-68,-69
2,0.0,,8859870.0,2885809.0,5831350.0,16840512.0,5270429.0,591058301.0,7279318,414308145,...,466119,3476047,5964627,81598109,6641628,85640574,1419456,5803715,6896726,23044299
3,0.0,"$1 under $25,000",3555603.0,496249.0,2241627.0,5286324.0,1551565.0,42351360.0,2519194,29460761,...,874,601,1174529,766535,1749990,1615697,449035,258237,2784252,6008426
4,0.0,"$25,000 under $50,000",2107465.0,501518.0,1342830.0,3897170.0,1195917.0,76646332.0,1865259,63488214,...,734,1209,1684304,4328850,1762100,4769972,251980,354726,1809800,5077936


In [26]:
# Keep only the four columns needed downstream.
# Three of them are located by their exact header text.
zip_col, income_bracket_col, nbr_returns_col = (
    df_xls.columns.get_loc(header)
    for header in (
        'ZIP\ncode [1]',
        'Size of adjusted gross income',
        'Number of returns',
    )
)
# The AGI header carries extra text after the phrase, so it is matched by
# substring; the first matching column position is used.
AGI_col = np.flatnonzero(df_xls.columns.str.contains('Adjusted gross income'))[0]

selected_positions = [zip_col, income_bracket_col, nbr_returns_col, AGI_col]
df_xls = df_xls.iloc[:, selected_positions]
df_xls

Unnamed: 0,ZIP\ncode [1],Size of adjusted gross income,Number of returns,Adjusted gross income (AGI)
0,,,,
1,,,-1.0,-6.0
2,0,,8859870.0,591058301.0
3,0,"$1 under $25,000",3555603.0,42351360.0
4,0,"$25,000 under $50,000",2107465.0,76646332.0
...,...,...,...,...
12386,[9] The amount of overpayments the tax filer ...,,,
12387,NOTE: This table presents aggregates of all re...,,,
12388,"In general, during administrative or Master Fi...",,,
12389,Detail may not add to totals because of rounding.,,,


In [27]:
# Replace the source headers with short, uniform labels.
# The order must match the column order produced by the selection step.
df_xls.columns = [
    'ZIP',
    'Income Bracket',
    'Nbr of Returns',
    'Adjusted Gross Income',
]
df_xls.head(n=20)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income
0,,,,
1,,,-1.0,-6.0
2,0.0,,8859870.0,591058301.0
3,0.0,"$1 under $25,000",3555603.0,42351360.0
4,0.0,"$25,000 under $50,000",2107465.0,76646332.0
5,0.0,"$50,000 under $75,000",1196060.0,73491779.0
6,0.0,"$75,000 under $100,000",737127.0,63848720.0
7,0.0,"$100,000 under $200,000",942941.0,127927947.0
8,0.0,"$200,000 or more",320674.0,206792163.0
9,,,,


In [28]:
# Filter the table in steps (stated goal: the rows showing the total of each ZIP code).
# Step 1: remove every row whose ZIP cell is empty.

zip_is_missing = df_xls['ZIP'].isna()
rows_to_drop = df_xls.loc[zip_is_missing].index
df_xls = df_xls.drop(index=rows_to_drop)
df_xls.head(10)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income
2,0,,8859870.0,591058301.0
3,0,"$1 under $25,000",3555603.0,42351360.0
4,0,"$25,000 under $50,000",2107465.0,76646332.0
5,0,"$50,000 under $75,000",1196060.0,73491779.0
6,0,"$75,000 under $100,000",737127.0,63848720.0
7,0,"$100,000 under $200,000",942941.0,127927947.0
8,0,"$200,000 or more",320674.0,206792163.0
10,10001,,12393.0,1644776.0
11,10001,"$1 under $25,000",4233.0,48988.0
12,10001,"$25,000 under $50,000",2573.0,95329.0


In [29]:
# Step 2: remove the rows whose ZIP value is 0.
# NOTE(review): these look like whole-file aggregate rows rather than a real
# ZIP code - confirm against the source table notes.
zip_is_zero = df_xls['ZIP'].eq(0)
rows_to_drop = df_xls.loc[zip_is_zero].index
df_xls = df_xls.drop(index=rows_to_drop)
df_xls.tail(20)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income
12371,14905,"$1 under $25,000",1753.0,20716.0
12372,14905,"$25,000 under $50,000",947.0,34843.0
12373,14905,"$50,000 under $75,000",655.0,40418.0
12374,14905,"$75,000 under $100,000",435.0,37543.0
12375,14905,"$100,000 under $200,000",519.0,69240.0
12376,14905,"$200,000 or more",134.0,53091.0
12377,** - Not shown to avoid disclosure of informat...,,,
12378,[1] The ZIP Code is based on the 5-digit ZIP c...,,,
12379,"[2] ""Qualified dividends"" are ordinary dividen...",,,
12380,"[3] Includes the Alaskan permanent fund, repor...",,,


In [30]:
# Step 3: remove rows that have no return count.
# The footnote rows at the bottom of the sheet have no return count either,
# so this step discards them as well.
returns_missing = df_xls['Nbr of Returns'].isna()
rows_to_drop = df_xls.loc[returns_missing].index
df_xls = df_xls.drop(index=rows_to_drop)
df_xls.tail(10)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income
12366,14904,"$75,000 under $100,000",405.0,34952.0
12367,14904,"$100,000 under $200,000",224.0,28999.0
12368,14904,"$200,000 or more",0.0,0.0
12370,14905,,4443.0,255851.0
12371,14905,"$1 under $25,000",1753.0,20716.0
12372,14905,"$25,000 under $50,000",947.0,34843.0
12373,14905,"$50,000 under $75,000",655.0,40418.0
12374,14905,"$75,000 under $100,000",435.0,37543.0
12375,14905,"$100,000 under $200,000",519.0,69240.0
12376,14905,"$200,000 or more",134.0,53091.0


In [31]:
# Derive the average income per return and tag every row with the tax year.
# Average_Income is in the same unit as the 'Adjusted Gross Income' column.
# Rows with zero returns do not raise; 0 / 0 yields NaN in the result.
df_xls['Average_Income'] = df_xls['Adjusted Gross Income'].div(df_xls['Nbr of Returns'])
df_xls['Year'] = year
df_xls

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income,Average_Income,Year
10,10001,,12393.0,1644776.0,132.718147,2009
11,10001,"$1 under $25,000",4233.0,48988.0,11.572880,2009
12,10001,"$25,000 under $50,000",2573.0,95329.0,37.049747,2009
13,10001,"$50,000 under $75,000",1767.0,108677.0,61.503679,2009
14,10001,"$75,000 under $100,000",1111.0,95936.0,86.351035,2009
...,...,...,...,...,...,...
12372,14905,"$25,000 under $50,000",947.0,34843.0,36.793031,2009
12373,14905,"$50,000 under $75,000",655.0,40418.0,61.706870,2009
12374,14905,"$75,000 under $100,000",435.0,37543.0,86.305747,2009
12375,14905,"$100,000 under $200,000",519.0,69240.0,133.410405,2009


In [32]:
# Promote ZIP from a regular column to the row index.
# ZIP values repeat across rows, so the resulting index is not unique.
# Re-running this cell fails, because 'ZIP' is no longer a column afterwards.
df_xls = df_xls.set_index(keys='ZIP')
df_xls

Unnamed: 0_level_0,Income Bracket,Nbr of Returns,Adjusted Gross Income,Average_Income,Year
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10001,,12393.0,1644776.0,132.718147,2009
10001,"$1 under $25,000",4233.0,48988.0,11.572880,2009
10001,"$25,000 under $50,000",2573.0,95329.0,37.049747,2009
10001,"$50,000 under $75,000",1767.0,108677.0,61.503679,2009
10001,"$75,000 under $100,000",1111.0,95936.0,86.351035,2009
...,...,...,...,...,...
14905,"$25,000 under $50,000",947.0,34843.0,36.793031,2009
14905,"$50,000 under $75,000",655.0,40418.0,61.706870,2009
14905,"$75,000 under $100,000",435.0,37543.0,86.305747,2009
14905,"$100,000 under $200,000",519.0,69240.0,133.410405,2009
