In [94]:
import pandas as pd
from pathlib import Path
import numpy as np
import csv

In [128]:
year = 2008
rows_to_skip = 6


# Get last two digits of year as input to reading path of data file
if year-2000 < 10:
    year_str = f"0{year-2000}"
else: year_str = f"{year-2000}"


# Path to use
if year < 2008:
    path_to_use = f"Data/Income/ZIP Code {year} NY.xls"
else: path_to_use = f"Data/Income/{year_str}zp33ny.xls"
    
# Read excel file and skip title and description rows
xls_path = Path(path_to_use)
df_xls = pd.read_excel(xls_path, skiprows=rows_to_skip, index_col=None, na_values=['NA'])
df_xls.head()

Unnamed: 0,Size of Adjusted Gross Income,Zip Code,Number of returns,Number of joint returns,Number with paid preparer's signature,Number of exemptions,Number of dependents,Adjusted gross income (AGI),Salaries and wages in AGI,Taxable Interest,...,Residential Energy Credit,Child tax credit,Child and dependent care credit,Earned income credit,Excess earned income credit (refundable),Alternative minimum tax,Income tax,Total tax liability,Tax due at time of filing,Overpayments refunded
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,-1.0,-2.0,-3.0,-4.0,-5.0,-6.0,-7.0,-8.0,...,-26.0,-27.0,-28.0,-29.0,-30.0,-31.0,-32.0,-33.0,-34.0,-35.0
3,,0.0,8964732.0,2888049.0,5920985.0,16716515.0,3128140.0,623353087.0,435016931.0,18401046.0,...,1826.0,1562142.0,290186.0,3151880.0,2621839.0,3797084.0,92353069.0,96575705.0,6280099.0,-21997481.0
4,"Under $10,000",0.0,1709042.0,147950.0,1030074.0,1768273.0,278514.0,6722607.0,6582929.0,996026.0,...,0.0,0.0,0.0,642125.0,511554.0,448.0,42455.0,326276.0,106712.0,-1310765.0


In [109]:
df_xls.rename(columns={df_xls.columns[0]:"Size of Adjusted Gross Income"},inplace = True)

In [110]:
df_xls['Size of Adjusted Gross Income'] = df_xls['Size of Adjusted Gross Income'].astype(str)

df_xls.loc[df_xls['Size of Adjusted Gross Income'].str[:1]=='1','Zip'] = df_xls['Size of Adjusted Gross Income']

# In the new ZIP column, fill rows below the ZIP code
df_xls['Zip'].fillna(method='ffill',inplace=True)

df_xls.loc[:, ['Size of Adjusted Gross Income','Zip']].head(30)

Unnamed: 0,Size of Adjusted Gross Income,Zip
0,ZIP Code &,
1,Size of Adjusted,
2,Gross Income,
3,,
4,Total,
5,"Under $10,000",
6,"$10,000 under $25,000",
7,"$25,000 under $50,000",
8,"$50,000 under $75,000",
9,"$75,000 under 100,000",


In [111]:
df_xls['Zip'].fillna(method='ffill',inplace=True)
df_xls['Zip'].head(30)

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
8       NaN
9       NaN
10      NaN
11      NaN
12      NaN
13      NaN
14      NaN
15      NaN
16      NaN
17      NaN
18      NaN
19      NaN
20    10001
21    10001
22    10001
23    10001
24    10001
25    10001
26    10001
27    10001
28    10002
29    10002
Name: Zip, dtype: object

In [129]:
# Choose relevant columns
zip_col = np.where(df_xls.columns.str.contains('Zip'))[0][0]
income_bracket_col = np.where(df_xls.columns.str.contains('Size of Adjusted Gross Income'))[0][0]
nbr_returns_col = df_xls.columns.get_loc('Number of returns')
AGI_col = df_xls.columns.get_loc('Adjusted Gross Income')
df_xls = df_xls.iloc[:, [zip_col,income_bracket_col,nbr_returns_col,AGI_col]]
df_xls.head(30)

KeyError: 'Adjusted Gross Income'

In [113]:
# Rename columns for consistency
df_xls.columns=(['ZIP','Income Bracket','Nbr of Returns','Adjusted Gross Income'])
df_xls.head(30)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income
0,,ZIP Code &,,
1,,Size of Adjusted,,
2,,Gross Income,-1.0,-4.0
3,,,,
4,,Total,8484864.0,505014120.0
5,,"Under $10,000",1724580.0,3607119.0
6,,"$10,000 under $25,000",1969428.0,33426237.0
7,,"$25,000 under $50,000",2076056.0,75318273.0
8,,"$50,000 under $75,000",1127505.0,69243431.0
9,,"$75,000 under 100,000",642782.0,55505596.0


In [114]:
# Take only the rows where it shows total of each ZIP code
# Step 1: Drop rows where ZIP is null
rows_to_drop = df_xls[df_xls['ZIP'].isnull()].index
df_xls = df_xls.drop(rows_to_drop)
df_xls.head(15)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income
20,10001,10001,11553.0,976187.0
21,10001,"Under $10,000",2330.0,-5951.0
22,10001,"$10,000 under $25,000",2220.0,37644.0
23,10001,"$25,000 under $50,000",2647.0,97163.0
24,10001,"$50,000 under $75,000",1520.0,92723.0
25,10001,"$75,000 under $100,000",848.0,73270.0
26,10001,"$100,000 or more",1988.0,681338.0
27,10001,,,
28,10002,10002,40975.0,1159588.0
29,10002,"Under $10,000",14705.0,53445.0


In [115]:
# Replace

df_xls.loc[df_xls['Income Bracket']==df_xls['ZIP'],'Income Bracket'] = 'Total'

In [116]:
# Step 2: Drop rows where ZIP is 0
rows_to_drop = df_xls[df_xls['ZIP']==0].index
df_xls = df_xls.drop(rows_to_drop)
df_xls.tail(10)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income
16042,14905,"$100,000 or more",516.0,103610.0
16043,14905,,,
16044,14905,,,
16045,14905,,,
16046,14905,,,
16047,14905,,,
16048,14905,,,
16049,14905,,,
16050,14905,,,
16051,14905,,,


In [118]:
# Step 2: Drop rows where ZIP is 0
rows_to_drop = df_xls[df_xls['Income Bracket']==''].index
df_xls = df_xls.drop(rows_to_drop)
df_xls.tail(10)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income
16042,14905,"$100,000 or more",516.0,103610.0
16043,14905,,,
16044,14905,,,
16045,14905,,,
16046,14905,,,
16047,14905,,,
16048,14905,,,
16049,14905,,,
16050,14905,,,
16051,14905,,,


In [122]:
# Step 3: Drop null returns  - this will also drop footer rows which we don't need
rows_to_drop = df_xls[df_xls['Nbr of Returns']=='          '].index


df_xls = df_xls.drop(rows_to_drop)
df_xls.dropna()
df_xls.tail(20)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income
16029,14904,"Under $10,000",1699.0,7903.0
16030,14904,"$10,000 under $25,000",2176.0,36981.0
16031,14904,"$25,000 under $50,000",1872.0,67387.0
16032,14904,"$50,000 under $75,000",794.0,48041.0
16033,14904,"$75,000 under $100,000",281.0,24024.0
16034,14904,"$100,000 or more",105.0,13948.0
16036,14905,Total,4470.0,231593.0
16037,14905,"Under $10,000",973.0,3344.0
16038,14905,"$10,000 under $25,000",954.0,16166.0
16039,14905,"$25,000 under $50,000",1017.0,37456.0


In [123]:
# Convert Nbr of Returns and AGI to float
df_xls= df_xls.replace('*         ','0')
df_xls= df_xls.replace('*              ','0')
df_xls= df_xls.replace('--        ','0')
df_xls= df_xls.replace('--             ','0')
df_xls= df_xls.replace('          ','0')
df_xls= df_xls.replace('               ','0')

df_xls['Nbr of Returns']=df_xls['Nbr of Returns'].astype('float')
df_xls['Adjusted Gross Income']=df_xls['Adjusted Gross Income'].astype('float')

In [124]:
# Add average income column and year
df_xls['Average_Income'] = df_xls['Adjusted Gross Income'] / df_xls['Nbr of Returns']
df_xls['Year'] = year
df_xls.head(10)

Unnamed: 0,ZIP,Income Bracket,Nbr of Returns,Adjusted Gross Income,Average_Income,Year
20,10001,Total,11553.0,976187.0,84.496408,2004
21,10001,"Under $10,000",2330.0,-5951.0,-2.554077,2004
22,10001,"$10,000 under $25,000",2220.0,37644.0,16.956757,2004
23,10001,"$25,000 under $50,000",2647.0,97163.0,36.706838,2004
24,10001,"$50,000 under $75,000",1520.0,92723.0,61.001974,2004
25,10001,"$75,000 under $100,000",848.0,73270.0,86.403302,2004
26,10001,"$100,000 or more",1988.0,681338.0,342.725352,2004
28,10002,Total,40975.0,1159588.0,28.29989,2004
29,10002,"Under $10,000",14705.0,53445.0,3.634478,2004
30,10002,"$10,000 under $25,000",11971.0,194118.0,16.215688,2004


In [125]:
# Set ZIP as index
df_xls = df_xls.set_index('ZIP')