In [16]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

% matplotlib inline

In [17]:
# Path of the raw per-zipcode IRS CSV for each tax year, keyed by the year as a
# string ("2016" down to "2008"). File names start with the two-digit year.
dfs = {
    str(year): "~/real_estate/data/raw/irs/{:02d}zpallagi.csv".format(year % 100)
    for year in range(2016, 2007, -1)
}

# Source columns to load. The header spelling differs between files, so the
# 2011 and 2008 files get their own variants of the same eight columns.
cols = ["STATE", "zipcode", "NUMDEP", "A00100", "A18500", "A19300", "N1", "MARS2"]
# 2011 variant: every header upper-case (only the zipcode column differs).
cols_2011 = [name.upper() for name in cols]
# 2008 variant: every header lower-case except the upper-case ZIPCODE.
cols_2008 = ["ZIPCODE" if name == "zipcode" else name.lower() for name in cols]

# Output names, matched position-by-position to the source column lists above.
new_cols = [
    "state",
    "zipcode",
    "dependent_count",
    "sum_adjusted_income",
    "sum_real_estate_taxes",
    "sum_mortgage_interest_paid",
    "tax_return_count",
    "joint_return_count",
]

In [18]:
def group_irs_data(filename, cols, new_cols, key, zip_var="zipcode", state_var="STATE"):
    """Load one IRS CSV and total its columns per (zipcode, state) pair.

    Parameters
    ----------
    filename : path or file-like object handed to ``pd.read_csv``.
    cols : list of source column names to read. Must include ``zip_var`` and
        ``state_var``; also fixes the column order of the result.
    new_cols : list of output column names, matched to ``cols`` by position.
    key : value stored in the added ``"year"`` column of every row.
    zip_var : name of the zipcode column in this file.
    state_var : name of the state column in this file.

    Returns
    -------
    pandas.DataFrame with one row per (zipcode, state) pair, the remaining
    columns summed, renamed to ``new_cols``, plus a ``"year"`` column.
    """
    summed = (
        pd.read_csv(filename, usecols=cols)
        .groupby([zip_var, state_var], as_index=False)
        .agg("sum")
    )

    # Restore the caller's column order before the positional rename.
    out = summed[cols]
    out.columns = new_cols
    out["year"] = key

    return out

In [19]:
# Build one aggregated frame per tax year. The 2011 and 2008 files use
# different header spellings, so they get their own column lists and key names.
d2016 = group_irs_data(dfs["2016"], cols, new_cols, 2016)
d2015 = group_irs_data(dfs["2015"], cols, new_cols, 2015)
d2014 = group_irs_data(dfs["2014"], cols, new_cols, 2014)
d2013 = group_irs_data(dfs["2013"], cols, new_cols, 2013)
d2012 = group_irs_data(dfs["2012"], cols, new_cols, 2012)
d2011 = group_irs_data(dfs["2011"], cols_2011, new_cols, 2011, zip_var="ZIPCODE")
d2010 = group_irs_data(dfs["2010"], cols, new_cols, 2010)
d2009 = group_irs_data(dfs["2009"], cols, new_cols, 2009)
d2008 = group_irs_data(dfs["2008"], cols_2008, new_cols, 2008, zip_var="ZIPCODE", state_var="state")

# Scale the 2008 money columns down by 1000.
# NOTE(review): presumably the 2008 file reports dollars while the other years
# report thousands of dollars -- confirm against the IRS file documentation.
for col in ["sum_adjusted_income", "sum_real_estate_taxes", "sum_mortgage_interest_paid"]:
    d2008[col] = d2008[col].values / 1000

# Stack all years into one frame. The list gets its own name so that `dfs`
# keeps pointing at the path dict; rebinding `dfs` to this list made the cell
# fail on re-run (`dfs["2016"]` on a list raises TypeError).
yearly_frames = [d2016, d2015, d2014, d2013, d2012, d2011, d2010, d2009, d2008]
df = pd.concat(yearly_frames)

In [20]:
# Derived head counts: a joint return is counted once more on top of the
# return count, and dependents are then added to get the total people count.
df["adult_count"] = (df["tax_return_count"] + df["joint_return_count"]).values
df["people_count"] = (df["adult_count"] + df["dependent_count"]).values

# Per-unit averages as (new column, total column, count column). The "_hh"
# columns divide by the number of returns; the income columns divide by the
# adult count and by the total people count respectively.
for mean_col, total_col, count_col in [
    ("mean_adjusted_income", "sum_adjusted_income", "adult_count"),
    ("mean_real_estate_taxes_hh", "sum_real_estate_taxes", "tax_return_count"),
    ("mean_mortgage_interest_hh", "sum_mortgage_interest_paid", "tax_return_count"),
    ("mean_adjusted_income_with_dep", "sum_adjusted_income", "people_count"),
]:
    df[mean_col] = df[total_col].values / df[count_col].values

In [21]:
# Drop the rows whose zipcode is 0.
# NOTE(review): presumably these are aggregate (non-zipcode) rows -- confirm.
# `.copy()` makes the result an independent frame, so assigning columns to it
# in later cells does not raise SettingWithCopyWarning on a slice.
df = df.loc[df["zipcode"] != 0].copy()

In [22]:
# Move every row's year forward by two.
# NOTE(review): presumably this lines the figures up as two-year-lagged values,
# given the "_lag" suffix the metric columns receive -- confirm.
# This cell is not idempotent: each re-run adds another 2 to the column.
df["year"] = df["year"] + 2

In [23]:
# Append "_lag" to every metric column; state, zipcode and year keep their
# names. A single rename with a mapping replaces the one-column-per-iteration
# loop, which rebuilt the whole frame twelve times.
lag_columns = [
    "dependent_count", "sum_adjusted_income", "sum_real_estate_taxes",
    "sum_mortgage_interest_paid", "tax_return_count", "joint_return_count",
    "adult_count", "people_count", "mean_adjusted_income", "mean_real_estate_taxes_hh",
    "mean_mortgage_interest_hh", "mean_adjusted_income_with_dep",
]
df = df.rename(columns={name: name + "_lag" for name in lag_columns})

In [24]:
# Write the combined frame to the backup CSV, without the row index.
df.to_csv(
    "~/real_estate/data/backup/irs_zipcode_data.csv",
    index=False,
)