In [146]:
import pandas as pd
from pathlib import Path

In [230]:
# Load the income workbook into a DataFrame.
# The first four spreadsheet rows (title and description) are skipped, so the
# next row supplies the column labels; cells containing the literal text 'NA'
# are read as missing values.
xls_path = Path("Data/Income/16zp33ny.xls")
read_options = dict(skiprows=4, index_col=None, na_values=['NA'])
df_xls = pd.read_excel(xls_path, **read_options)
df_xls.head(5)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Total,...,Number of returns.58,Amount.58,Number of returns.59,Amount.59,Number of returns.60,Amount.60,Number of returns.61,Amount.61,Number of returns.62,Amount.62
0,,,-1.0,-2.0,-3.0,-4.0,-5.0,-6.0,-7.0,-8.0,...,-134.0,-135.0,-136.0,-137.0,-138.0,-139.0,-140.0,-141.0,-142.0,-143.0
1,0.0,Total,9468060.0,4848730.0,2939200.0,1480010.0,6084350.0,17229510.0,5371740.0,205040.0,...,7372450.0,131579060.0,326320.0,964402.0,324020.0,2088223.0,1765120.0,10011123.0,7193680.0,26254821.0
2,0.0,"$1 under $25,000",3445310.0,2354910.0,445530.0,595140.0,2114250.0,4860280.0,1503570.0,139400.0,...,1713380.0,1659527.0,20.0,8.0,0.0,0.0,441730.0,387684.0,2729280.0,5662193.0
3,0.0,"$25,000 under $50,000",2123960.0,1161980.0,441060.0,474430.0,1285470.0,3760760.0,1210680.0,53620.0,...,1815900.0,4999491.0,0.0,0.0,0.0,0.0,290240.0,481914.0,1800810.0,4661819.0
4,0.0,"$50,000 under $75,000",1297130.0,644420.0,398720.0,216490.0,829250.0,2370910.0,677620.0,10330.0,...,1256620.0,7566246.0,30.0,18.0,20.0,42.0,258680.0,583707.0,1012590.0,2930279.0


In [231]:
# Keep four columns and give them readable names.
# Columns are picked by position because most header labels in the raw frame
# are auto-generated ('Unnamed: N'). Each key below is a 0-based column
# position and each value is the name that column receives.
# NOTE(review): positions 17 and 18 are hard-coded for this workbook layout;
# re-check them if the input file changes.
selected_columns = {
    0: 'ZIP',
    1: 'Income Bracket',
    17: 'Number of Returns',
    18: 'Amount',
}
df_xls = df_xls.iloc[:, list(selected_columns.keys())]
df_xls.columns = list(selected_columns.values())
df_xls.head(15)

Unnamed: 0,ZIP,Income Bracket,Number of Returns,Amount
0,,,-16.0,-17.0
1,0.0,Total,9468050.0,795119743.0
2,0.0,"$1 under $25,000",3445290.0,42897756.0
3,0.0,"$25,000 under $50,000",2123960.0,78326525.0
4,0.0,"$50,000 under $75,000",1297130.0,80718724.0
5,0.0,"$75,000 under $100,000",825300.0,72248027.0
6,0.0,"$100,000 under $200,000",1242990.0,172388638.0
7,0.0,"$200,000 or more",533380.0,348540073.0
8,,,,
9,10001.0,,14520.0,2323084.0


In [232]:
# Goal of the next three steps: keep only the per-ZIP total rows.
# Step 1: Discard every row that carries an income-bracket label, i.e. keep
# only the rows whose 'Income Bracket' cell is null.

bracket_is_missing = df_xls['Income Bracket'].isnull()
df_xls = df_xls[bracket_is_missing]
df_xls.head(10)

Unnamed: 0,ZIP,Income Bracket,Number of Returns,Amount
0,,,-16.0,-17.0
8,,,,
9,10001.0,,14520.0,2323084.0
16,,,,
17,10002.0,,42180.0,2313723.0
24,,,,
25,10003.0,,28660.0,6720746.0
32,,,,
33,10004.0,,2480.0,830828.0
40,,,,


In [233]:
# Step 2: Discard rows that have no ZIP value (blank separator rows and the
# column-number marker row), keeping only rows where 'ZIP' is populated.
zip_is_present = df_xls['ZIP'].notnull()
df_xls = df_xls[zip_is_present]
df_xls

Unnamed: 0,ZIP,Income Bracket,Number of Returns,Amount
9,10001,,14520.0,2323084.0
17,10002,,42180.0,2313723.0
25,10003,,28660.0,6720746.0
33,10004,,2480.0,830828.0
41,10005,,5940.0,3171561.0
...,...,...,...,...
12340,[12] The amount of overpayments the tax filer...,,,
12341,NOTE: This table presents aggregates of all re...,,,
12342,This table is based on tax returns as initiall...,,,
12343,Detail may not add to totals because of rounding.,,,


In [234]:
# Step 3: Drop the 'Income Bracket' column (expected to be entirely null after
# Step 1), then drop every row that still contains a missing value.
# It is the dropna() call, not the column drop, that removes the footer note
# rows: their 'Number of Returns' and 'Amount' cells are empty.
# errors='ignore' makes the cell safe to re-run: once the column is already
# gone, the drop is a no-op instead of raising KeyError.
df_xls = df_xls.drop(columns='Income Bracket', errors='ignore').dropna()
df_xls

Unnamed: 0,ZIP,Number of Returns,Amount
9,10001,14520.0,2323084.0
17,10002,42180.0,2313723.0
25,10003,28660.0,6720746.0
33,10004,2480.0,830828.0
41,10005,5940.0,3171561.0
...,...,...,...
12289,14901,5280.0,193462.0
12297,14903,3600.0,224870.0
12305,14904,6730.0,246083.0
12313,14905,4260.0,284749.0


In [235]:
# Derive the average amount per return for each ZIP and tag every row with the
# data year.
# NOTE(review): 'Average_Income' is expressed in whatever unit 'Amount' uses.
# The displayed values (e.g. ~160 for ZIP 10001) suggest 'Amount' is in
# thousands of dollars - confirm against the workbook's header notes.
total_amount = df_xls['Amount']
returns_filed = df_xls['Number of Returns']
df_xls['Average_Income'] = total_amount / returns_filed
df_xls['Year'] = 2016
df_xls

Unnamed: 0,ZIP,Number of Returns,Amount,Average_Income,Year
9,10001,14520.0,2323084.0,159.992011,2016
17,10002,42180.0,2313723.0,54.853556,2016
25,10003,28660.0,6720746.0,234.499163,2016
33,10004,2480.0,830828.0,335.011290,2016
41,10005,5940.0,3171561.0,533.932828,2016
...,...,...,...,...,...
12289,14901,5280.0,193462.0,36.640530,2016
12297,14903,3600.0,224870.0,62.463889,2016
12305,14904,6730.0,246083.0,36.565082,2016
12313,14905,4260.0,284749.0,66.842488,2016


In [238]:
# Use ZIP as the row index.
# The guard keeps this cell re-runnable: after the first run 'ZIP' is no longer
# a column, so calling set_index('ZIP') again would raise KeyError.
if df_xls.index.name != 'ZIP':
    df_xls = df_xls.set_index('ZIP')
df_xls

Unnamed: 0_level_0,Number of Returns,Amount,Average_Income,Year
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10001,14520.0,2323084.0,159.992011,2016
10002,42180.0,2313723.0,54.853556,2016
10003,28660.0,6720746.0,234.499163,2016
10004,2480.0,830828.0,335.011290,2016
10005,5940.0,3171561.0,533.932828,2016
...,...,...,...,...
14901,5280.0,193462.0,36.640530,2016
14903,3600.0,224870.0,62.463889,2016
14904,6730.0,246083.0,36.565082,2016
14905,4260.0,284749.0,66.842488,2016
