In [35]:
#load libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from itertools import cycle, islice
import weightedcalcs as wc

# This line lets us plot on our ipython notebook
%matplotlib inline

# This python notebook parses and summarizes PUMs data
### Four functions:
    
#### 1) filtercolumns: gets relevant columns from pums data, joins with middle income data
#### 2) rentersonly: gets renter hh from df from fxn 1
#### 3) calcmiddle: creates middle class summary table from df from fxn 1 or 2
#### 4) calcBurden: creates cost burden summary table from df from fxn 1 or 2

In [36]:
# read in middle income guidelines 
mid = pd.read_table("proj_data/midincome_guidelines.csv", sep=',', low_memory=False)
mid.head()

#get poverty tables for ea year
mid15=mid.iloc[:,[0,8,9]]
mid10=mid.iloc[:,[0,5,6]]
mid05=mid.iloc[:,[0,2,3]]

In [37]:
#load pums data
pums15 = "proj_data/ss15hpa_1yr.csv"
pums10="proj_data/ss10hpa.csv"
pums05="proj_data/ss05hpa.csv"


In [38]:
def filtercolumns(pumsdata, middata):
    """
    function filtercolumns pulls relevant columns from each year's pums households data (csv)
    and joins with poverty data based on hhd_size, 
    midincome data (a DF of just that year's middle income guidelines) 
    returns df of pums data with joined middle class data
    """
    pums = pd.read_table(pumsdata,sep=",", low_memory=False)
    df_relevant = pums[['SERIALNO','ST', 'PUMA', 'WGTP', 'NP', 'TEN', 'BLD', 'RNTP', 'VACS','HINCP','SMOCP','GRNTP', 'GRPIP','OCPIP', 'KIT', 'PLM']]
    pums_inc = df_relevant.merge(middata, left_on='NP', right_on='hhd_size')
    return pums_inc

# test code
fd15=filtercolumns(pums15,mid15)
fd15.head()


Unnamed: 0,SERIALNO,ST,PUMA,WGTP,NP,TEN,BLD,RNTP,VACS,HINCP,SMOCP,GRNTP,GRPIP,OCPIP,KIT,PLM,hhd_size,lowerbound15,upperbound15
0,26,42,2600,37,1,3.0,2.0,500.0,,45000.0,,603.0,16.0,,1.0,1.0,1,18525.33,55576.0
1,172,42,3502,0,1,,,,,,,,,,,,1,18525.33,55576.0
2,206,42,1900,76,1,4.0,3.0,,,19000.0,,,,,1.0,1.0,1,18525.33,55576.0
3,208,42,2002,80,1,3.0,9.0,290.0,,14900.0,,360.0,29.0,,1.0,1.0,1,18525.33,55576.0
4,352,42,500,23,1,2.0,2.0,,,21700.0,755.0,,,42.0,1.0,1.0,1,18525.33,55576.0


In [23]:
def rentersonly(data, yyyy):
    """
    pulls renter records 
    requires two arguments, 
    data is pums df with joined mid data
    yyyy is 4-digit year, eg 2015
    returns renters df
    
    """
    #convert yyyy to mid_yy, eg 2015 to mid_15
    yyyys = str(yyyy)
    yy=yyyys[-2:]
    midyearup = "upperbound" + str(yy)
    midyearlow = "lowerbound" + str(yy)
    
    #filter renters with complete kitchen and plumbing
    r = data[data['TEN']==3]
    rK = r[r['KIT']==1]
    rKP = rK[rK['PLM']==1]
    return rKP

In [39]:
def ownersonly(data, yyyy):
    """
    pulls homeowners records 
    requires two arguments, 
    data is pums df with joined mid data
    yyyy is 4-digit year, eg 2015
    returns owners df
    
    """
    #convert yyyy to mid_yy, eg 2015 to mid_15
    yyyys = str(yyyy)
    yy=yyyys[-2:]
    midyearup = "upperbound" + str(yy)
    midyearlow = "lowerbound" + str(yy)
    
    #filter homeowners with complete kitchen and plumbing
    h = data[(data['TEN']==1) | (data['TEN']==2)] 
    hK = h[h['KIT']==1]
    hKP = hK[hK['PLM']==1]
    return hKP

In [40]:
def calcMid(data, yyyy, exportname=""):
    """
    calculates % of HH below federal poverty line in each PUMA. 
    requires 2 arguments:
    data is pums df with joined mid data (can be df from rentersonly or ownersonly)
    yyyy is 4-digit year, eg 2015
    exportname is export csv name
    if no exportname is provided, returns pct middleclass by puma as DF 
    otherwise returns csv
    
    """
    #convert yyyy to mid_yy, eg 2015 to mid_15
    yyyys = str(yyyy)
    yy=yyyys[-2:]
    midyearup = "upperbound" + str(yy)
    midyearlow = "lowerbound" + str(yy)
    
    #add new column
    data['WGTPmid'] = data['WGTP'].where((data[midyearlow]>=data['HINCP']) | (data['HINCP']<=data[midyearup]), 0)
    
    #group by Puma
    grp = data.groupby(["PUMA"])
    
    #summarize hhs below poverty
    midsum = grp[['WGTP','WGTPmid']].sum()
    
    #calc %
    midsum['PCTmid'] = (midsum['WGTPmid'] / midsum['WGTP'])*100
    
    #determine what fxn returns
    if len(exportname)>0:
        midsum.to_csv(exportname)
        print(str(yyyy)+" middle class summary table successfully exported to csv")
    else:
        return midsum
    
#code to test that it worked!
# naming convention: r15_poverty
#calcMid(rentersonly(fd15,2015),2015)

In [41]:
def RcalcBurden(data, yyyy, exportname=""):
    """
    calculates % of HH with cost burden greater than 30% in ea puma for RENTERS
    requires 2 arguments:
    data is pums df with joined mid data (can be df from rentersonly)
    exportname is export csv name
    if no exportname is provided, returns pct with cost burden >30% by puma as DF 
    otherwise returns csv    
    """
    #convert yyyy to mid_yy, eg 2015 to mid_15
    yyyys = str(yyyy)
    yy=yyyys[-2:]
    midyearup = "upperbound" + str(yy)
    midyearlow = "lowerbound" + str(yy)
    
    #from renters only table, grab only hh in middle class
    need = data[((data[midyearlow]>=data['HINCP']) | (data['HINCP']<=data[midyearup]) & (data['GRPIP'].notnull()))]
    
    #add new column for 30% burden
    need['WGTP30'] = need['WGTP'].where(need['GRPIP']>=30.0, 0)
    
    #add new column for 50% burden
    need['WGTP50'] = need['WGTP'].where(need['GRPIP']>=50.0, 0)
    
    #group by puma
    grp = need.groupby(["PUMA"])
    
    #calculate totals by PUMA of hhds and hhds with burden over 30% and 50%
    burdensum = grp[['WGTP','WGTP30','WGTP50']].sum()
    
    #add pct column for 30%
    burdensum['PCTburden30'] = (burdensum['WGTP30'] / burdensum['WGTP'])*100
    
    # add pct column for 50%
    burdensum['PCTburden50'] = (burdensum['WGTP50'] / burdensum['WGTP'])*100
    
    #export or return df
    if len(exportname)>0:
        burdensum.to_csv(exportname)
        print(str(yyyy)+" burden summary table successfully exported to csv")
    else:
        return burdensum


#code to test that it worked!
#RcalcBurden(rentersonly(fd15,15),15)

In [42]:
def HcalcBurden(data, yyyy, exportname=""):
    """
    calculates % of HH with cost burden greater than 30% in ea puma for OWNERS
    requires 2 arguments:
    data is pums df with joined mid data (can be df from ownersonly)
    exportname is export csv name
    if no exportname is provided, returns pct with cost burden >30% by puma as DF 
    otherwise returns csv    
    """
    #convert yyyy to pov_yy, eg 2015 to mid_15
    yyyys = str(yyyy)
    yy=yyyys[-2:]
    midyearup = "upperbound" + str(yy)
    midyearlow = "lowerbound" + str(yy)
    
    #from owners only table, grab only hh in middle class
    need = data[((data[midyearlow]>=data['HINCP']) | (data['HINCP']<=data[midyearup]) & (data['OCPIP'].notnull()))]
    
    #add new column for 30% burden
    need['WGTP30'] = need['WGTP'].where(need['OCPIP']>=30.0, 0)
    
     #add new column for 50% burden
    need['WGTP50'] = need['WGTP'].where(need['OCPIP']>=50.0, 0)
    
    #group by puma
    grp = need.groupby(["PUMA"])
    
    #calculate totals by PUMA of hhds and hhds with burden over 30% and 50%
    burdensum = grp[['WGTP','WGTP30','WGTP50']].sum()
    
    #add pct column for 30%
    burdensum['PCTburden30'] = (burdensum['WGTP30'] / burdensum['WGTP'])*100
    
    # add pct column for 50%
    burdensum['PCTburden50'] = (burdensum['WGTP50'] / burdensum['WGTP'])*100
    
    #export or return df
    if len(exportname)>0:
        burdensum.to_csv(exportname)
        print(str(yyyy)+" burden summary table successfully exported to csv")
    else:
        return burdensum


#code to test that it worked!
#HcalcBurden(ownersonly(fd15,15),15)

In [43]:
## RUN FUNCTIONS TO PRODUCE RENTER CSVs FOR ALL YEARS
RcalcBurden(rentersonly(filtercolumns(pums15, mid15),2015),2015,"r15_burdenM.csv")
RcalcBurden(rentersonly(filtercolumns(pums10, mid10),2010),2010,"r10_burdenM.csv")
RcalcBurden(rentersonly(filtercolumns(pums05, mid05),2005),2005,"r05_burdenM.csv")
calcMid(rentersonly(filtercolumns(pums15, mid15),2015),2015, "r15_middle.csv")
calcMid(rentersonly(filtercolumns(pums10, mid10),2010),2010, "r10_middle.csv")
calcMid(rentersonly(filtercolumns(pums05, mid05),2005),2005, "r05_middle.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2015 burden summary table successfully exported to csv
2010 burden summary table successfully exported to csv
2005 burden summary table successfully exported to csv
2015 middle class summary table successfully exported to csv
2010 middle class summary table successfully exported to csv
2005 middle class summary table successfully exported to csv


In [44]:
## RUN FUNCTIONS TO PRODUCE HOMEOWNER CSVs FOR ALL YEARS
HcalcBurden(ownersonly(filtercolumns(pums15, mid15),2015),2015,"h15_burdenM.csv")
HcalcBurden(ownersonly(filtercolumns(pums10, mid10),2010),2010,"h10_burdenM.csv")
HcalcBurden(ownersonly(filtercolumns(pums05, mid05),2005),2005,"h05_burdenM.csv")
calcMid(ownersonly(filtercolumns(pums15, mid15),2015),2015, "h15_middle.csv")
calcMid(ownersonly(filtercolumns(pums10, mid10),2010),2010, "h10_middle.csv")
calcMid(ownersonly(filtercolumns(pums05, mid05),2005),2005, "h05_middle.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


2015 burden summary table successfully exported to csv
2010 burden summary table successfully exported to csv
2005 burden summary table successfully exported to csv
2015 middle class summary table successfully exported to csv
2010 middle class summary table successfully exported to csv
2005 middle class summary table successfully exported to csv


# summary
"""
basically, use filter fxn to filter columns + join pov, then use rentersonly to grab only renter 
households. then calcBurden or calcPov to get summary tables 
NOTE: should be able to adjust calcBurden and calcPov so we can use these fxns for 
owners too
"""

In [11]:
#Results: Run code here when ready to export

#no export, returns dfs
# calcBurden(rentersonly(filtercolumns(pums10, pov10),2010),2010)
# calcBurden(rentersonly(filtercolumns(pums05, pov05),2005),2005)
# calcPov(rentersonly(filtercolumns(pums10, pov10),2010),2010)
# calcPov(rentersonly(filtercolumns(pums05, pov05),2005),2005)

#export to csv
# calcBurden(rentersonly(filtercolumns(pums10, pov10),2010),2010,"r10_totalsburden30.csv")
# calcBurden(rentersonly(filtercolumns(pums05, pov05),2005),2005,"r05_totalsburden30.csv")
# calcPov(rentersonly(filtercolumns(pums10, pov10),2010),2010, "r10_poverty.csv")
# calcPov(rentersonly(filtercolumns(pums05, pov05),2005),2005, "r05_poverty.csv")

burden summary table successfully exported to csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
