In [9]:
import pandas as pd
from pathlib import Path
import numpy as np
import csv
import os
import matplotlib.pyplot as plt
import hvplot.pandas
import plotly.express as px
from matplotlib import cm
from panel.interact import interact
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

%matplotlib inline

In [10]:
# Load the lookup table that maps each ZIP code to a neighborhood name.
zip_neigh = Path("boroughs_neighborhood_block_zipcode.csv")

# Keep only the neighborhood (position 0) and ZIP (position 2) columns.
# read_csv already returns a DataFrame, and the explicit .copy() makes the
# selection an independent frame so the column assignments below cannot raise
# SettingWithCopyWarning.
nyc_data = pd.read_csv(zip_neigh).iloc[:, [0, 2]].copy()

# Strip stray whitespace first so that otherwise-identical names collapse
# when duplicates are dropped.
nyc_data['NEIGHBORHOOD'] = nyc_data['NEIGHBORHOOD'].str.strip()
nyc_data = nyc_data.drop_duplicates()
nyc_data.columns = ['Neighborhood', 'ZIP']

# Store ZIP as a string (via int, to discard any decimal part) so it can be
# used as a merge key against the income data.
nyc_data['ZIP'] = nyc_data['ZIP'].astype(int).astype(str)
nyc_data

Unnamed: 0,Neighborhood,ZIP
0,BATHGATE,10457
8,BATHGATE,10458
11,BATHGATE,10456
25,BATHGATE,10460
35,BAYCHESTER,10469
...,...,...
45800,SPRINGFIELD GARDENS,11430
45971,ST. ALBANS,11435
46979,EAST ELMHURST,11417
47627,LITTLE NECK,11355


In [11]:
def clean_up_income_data(year):
    """Load one tax year of NY ZIP-code income data and normalise its layout.

    The yearly Excel workbooks differ in the number of title rows, the file
    naming scheme and the header spelling, so each variant is handled
    explicitly before the columns are reduced to a common set.

    Parameters
    ----------
    year : int
        Tax year to load. Layouts are defined for 2004 and later.

    Returns
    -------
    pandas.DataFrame
        One row per ZIP code / income bracket with the columns ``ZIP`` (str),
        ``Income Bracket``, ``Nbr of Returns`` (float),
        ``Adjusted Gross Income`` (float), ``Average_Income`` (adjusted gross
        income divided by number of returns) and ``Year``.

    Raises
    ------
    ValueError
        If ``year`` does not correspond to a known workbook layout
        (i.e. anything before 2004).
    """
    # Number of title/description rows that precede the header row.
    if 2004 <= year <= 2006 or year == 2008:
        rows_to_skip = 6
    elif year == 2007:
        rows_to_skip = 5
    elif year > 2008:
        rows_to_skip = 3
    else:
        # Fail early with a clear message instead of an UnboundLocalError
        # at the read_excel call below.
        raise ValueError(
            f"No income workbook layout is known for year {year!r}; "
            "supported years are 2004 and later."
        )

    # Files before 2008 spell out the full year; later files are prefixed
    # with the zero-padded two-digit year.
    if year < 2008:
        path_to_use = f"Data/Income/ZIP Code {year} NY.xls"
    else:
        short_year = year - 2000
        year_str = f"0{short_year}" if short_year < 10 else f"{short_year}"
        path_to_use = f"Data/Income/{year_str}zp33ny.xls"

    # Read the workbook, skipping the title and description rows.
    df_xls = pd.read_excel(Path(path_to_use), skiprows=rows_to_skip, index_col=None, na_values=['NA'])

    if year == 2004 or year == 2005:
        # These workbooks have no ZIP column: the first column holds both the
        # income brackets and the ZIP codes. Rows whose first-column text
        # starts with '1' are treated as ZIP rows, and that value is carried
        # forward onto the bracket rows that follow.
        df_xls = df_xls.rename(columns={df_xls.columns[0]: "Size of Adjusted Gross Income"})
        df_xls['Size of Adjusted Gross Income'] = df_xls['Size of Adjusted Gross Income'].astype(str)
        is_zip_row = df_xls['Size of Adjusted Gross Income'].str[:1] == '1'
        df_xls.loc[is_zip_row, 'Zip'] = df_xls['Size of Adjusted Gross Income']
        # Assign the result back: chained inplace fillna is deprecated and has
        # no effect on the frame under copy-on-write.
        df_xls['Zip'] = df_xls['Zip'].ffill()

    if year == 2006:
        df_xls = df_xls.rename(columns={df_xls.columns[0]: "Size of Adjusted Gross Income"})
        df_xls = df_xls.rename(columns={df_xls.columns[1]: "Zip"})

    # Locate the relevant columns; header spelling differs between layouts.
    if 2004 <= year <= 2006:
        zip_col = np.where(df_xls.columns.str.contains('Zip'))[0][0]
        income_bracket_col = np.where(df_xls.columns.str.contains('Size of Adjusted Gross Income'))[0][0]
        AGI_col = df_xls.columns.get_loc('Adjusted Gross Income')
    elif 2007 <= year <= 2008:
        zip_col = df_xls.columns.get_loc('Zip Code')
        income_bracket_col = np.where(df_xls.columns.str.contains('Size of Adjusted Gross Income'))[0][0]
        AGI_col = np.where(df_xls.columns.str.contains('Adjusted gross income'))[0][0]
    else:
        zip_col = np.where(df_xls.columns.str.contains('ZIP'))[0][0]
        income_bracket_col = np.where(df_xls.columns.str.contains('Size of adjusted gross income'))[0][0]
        AGI_col = np.where(df_xls.columns.str.contains('Adjusted gross income'))[0][0]

    nbr_returns_col = df_xls.columns.get_loc('Number of returns')

    # .copy() makes the four-column selection an independent frame so the
    # assignments below do not raise SettingWithCopyWarning.
    df_xls = df_xls.iloc[:, [zip_col, income_bracket_col, nbr_returns_col, AGI_col]].copy()

    # Rename columns for consistency across years.
    df_xls.columns = ['ZIP', 'Income Bracket', 'Nbr of Returns', 'Adjusted Gross Income']

    # Rows where the bracket cell merely repeats the ZIP are the per-ZIP totals.
    df_xls.loc[df_xls['Income Bracket'] == df_xls['ZIP'], 'Income Bracket'] = 'Total'

    # Drop rows that carry no usable data: missing or zero ZIP, missing or
    # blank return counts, and the state-wide 'TOTAL' row.
    df_xls = df_xls.drop(df_xls[df_xls['ZIP'].isnull()].index)
    df_xls = df_xls.drop(df_xls[df_xls['ZIP'] == 0].index)
    df_xls = df_xls.drop(df_xls[df_xls['Nbr of Returns'].isnull()].index)
    df_xls = df_xls.drop(df_xls[df_xls['Nbr of Returns'] == '          '].index)
    df_xls = df_xls.drop(df_xls[df_xls['ZIP'] == 'TOTAL'].index)

    # Placeholder cell values (whole-cell matches) are turned into '0' so the
    # numeric columns can be converted to float.
    placeholder_values = (
        '.',
        '*',
        '*         ',
        '*              ',
        '--        ',
        '--             ',
        '          ',
        '               ',
    )
    for placeholder in placeholder_values:
        df_xls = df_xls.replace(placeholder, '0')

    # Rows that still have no bracket label are labelled as per-ZIP totals.
    df_xls['Income Bracket'] = df_xls['Income Bracket'].fillna('Total')

    df_xls['Nbr of Returns'] = df_xls['Nbr of Returns'].astype('float')
    df_xls['Adjusted Gross Income'] = df_xls['Adjusted Gross Income'].astype('float')
    # Via int first so the ZIP string carries no decimal part.
    df_xls['ZIP'] = df_xls['ZIP'].astype(int)
    df_xls['ZIP'] = df_xls['ZIP'].astype(str)

    # Add average income column and year.
    df_xls['Average_Income'] = df_xls['Adjusted Gross Income'] / df_xls['Nbr of Returns']
    df_xls['Year'] = year

    return df_xls

In [12]:
# Stack the cleaned frames for every tax year from 2004 through 2016 into one
# long frame; join='inner' keeps only the columns shared by all of them.
df = pd.concat(
    [clean_up_income_data(tax_year) for tax_year in range(2004, 2017)],
    axis='rows',
    join='inner',
)

In [13]:
# Keep only the per-ZIP 'Total' rows, reduced to the three columns used
# downstream (positions 0, 4 and 5 of the concatenated frame).
df_total = df.loc[df['Income Bracket'] == 'Total', ['ZIP', 'Average_Income', 'Year']]

In [18]:
# Attach a neighborhood name to every ZIP-level total, then average the income
# figures per (Year, Neighborhood) pair.
income_neighborhood = (
    df_total
    .merge(nyc_data, on='ZIP', how='inner')
    .drop(columns=['ZIP'])
    .set_index('Year')
    .groupby(['Year', 'Neighborhood'])
    .mean()
)

# Persist the yearly figures for the dashboard and preview the last rows.
income_neighborhood.to_csv('Data/Dashboard/dash_income_yearly.csv')
income_neighborhood.tail(75)

Unnamed: 0_level_0,Unnamed: 1_level_0,Average_Income
Year,Neighborhood,Unnamed: 2_level_1
2016,MIDTOWN EAST,297.262694
2016,MIDTOWN WEST,294.202816
2016,MIDWOOD,58.810700
2016,MILL BASIN,65.454065
2016,MORNINGSIDE HEIGHTS,77.620821
2016,...,...
2016,WINDSOR TERRACE,80.351665
2016,WOODHAVEN,41.517217
2016,WOODLAWN,50.415666
2016,WOODSIDE,54.656987


In [19]:
# Interactive plot of average income by year; groupby='Neighborhood' asks
# hvPlot to split the data by neighborhood.
income_neighborhood.hvplot(
    groupby='Neighborhood',
    width=800,
    height=400,
)

In [37]:
# Turn the (Year, Neighborhood) index levels back into ordinary columns so the
# rows can be filtered by neighborhood name.
income_neighborhood_reset = income_neighborhood.reset_index()
# NOTE: a commented-out first draft of the per-neighborhood statistics used to
# follow here. It computed the mean and the standard deviation of the yearly
# percentage change in Average_Income for each neighborhood. The live
# implementation is cumulative_income_stats, which reports the mean change but
# not the standard deviation.

In [138]:
# Spot-check on a single neighborhood: value of Average_Income in the last
# row recorded for FLATIRON.
neigh_slice = (
    income_neighborhood_reset
    .loc[income_neighborhood_reset['Neighborhood'] == 'FLATIRON']
    .set_index('Year')
)
last_income = neigh_slice['Average_Income'].iloc[-1:]
last_income.values[0]


313.98642778739577

In [139]:


def cumulative_income_stats(Neighborhood):
    """Summarise the average-income history of one neighborhood.

    Reads the module-level ``income_neighborhood_reset`` frame, using its
    ``Year``, ``Neighborhood`` and ``Average_Income`` columns. Rows are taken
    in the order they appear in that frame (assumed chronological - confirm
    against how the frame is built).

    Parameters
    ----------
    Neighborhood : str
        Name to match against the ``Neighborhood`` column.

    Returns
    -------
    list
        ``[last_income, avg_pct_change, pct_change_3yr, comparison_3yr,
        pct_change_5yr, comparison_5yr]``: the last Average_Income value, the
        mean row-over-row percentage change over all rows, over the last three
        rows and over the last five rows, and the difference of each recent
        mean from the overall mean. Everything except ``last_income`` is
        rounded to four decimals.
    """
    matches = income_neighborhood_reset['Neighborhood'].isin([Neighborhood])
    income_by_year = income_neighborhood_reset[matches].set_index('Year')['Average_Income']

    last_income = income_by_year.iloc[-1:].values[0]
    yearly_change = income_by_year.pct_change()

    def rounded_mean(changes):
        # Mean of the percentage changes, rounded to four decimals.
        return round(changes.mean(), 4)

    overall = rounded_mean(yearly_change)
    recent_3yr = rounded_mean(yearly_change.iloc[-3:])
    recent_5yr = rounded_mean(yearly_change.iloc[-5:])

    return [
        last_income,
        overall,
        recent_3yr,
        round(recent_3yr - overall, 4),
        recent_5yr,
        round(recent_5yr - overall, 4),
    ]

In [172]:
# Compute the income statistics once per neighborhood. The 'Neighborhood'
# column repeats every name once per year, so iterating over the raw column
# would recompute identical statistics many times. unique() returns names in
# order of first appearance, so the row order of the result is unchanged.
nyc_stats = {}
for Neighborhood in income_neighborhood_reset['Neighborhood'].unique():
    nyc_stats[Neighborhood] = cumulative_income_stats(Neighborhood)

# The dict maps name -> list of six statistics; transpose so that each
# neighborhood becomes a row and each statistic a column.
nyc_stats_df = pd.DataFrame(nyc_stats).transpose()
nyc_stats_df.columns = [
    'last_income',
    'avg_pct_change',
    'pct_change_3yr',
    'comparison_3yr',
    'pct_change_5yr',
    'comparison_5yr',
]
nyc_stats_df.index.name = "Neighborhood"
nyc_stats_df

Unnamed: 0_level_0,last_income,avg_pct_change,pct_change_3yr,comparison_3yr,pct_change_5yr,comparison_5yr
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1021,256.526464,0.0762,0.0724,-0.0038,0.0691,-0.0071
1026,259.148148,0.0395,0.0205,-0.0190,0.0253,-0.0142
3004,43.070362,0.0190,0.0193,0.0003,0.0163,-0.0027
3019,89.128119,0.0650,0.0815,0.0165,0.0755,0.0105
AIRPORT JFK,47.046511,0.0078,0.0151,0.0073,0.0162,0.0084
...,...,...,...,...,...,...
WINDSOR TERRACE,80.351665,0.0427,0.0547,0.0120,0.0516,0.0089
WOODHAVEN,41.517217,0.0213,0.0364,0.0151,0.0276,0.0063
WOODLAWN,50.415666,0.0206,0.0256,0.0050,0.0219,0.0013
WOODSIDE,54.656987,0.0294,0.0432,0.0138,0.0385,0.0091


In [173]:
# Write the per-neighborhood income statistics to CSV; the "Neighborhood"
# index is written as the first column.
nyc_stats_df.to_csv('Data/Income/income_data.csv')