In [1]:
#load libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from itertools import cycle, islice
import weightedcalcs as wc

# This line lets us plot on our ipython notebook
%matplotlib inline

# This python notebook parses and summarizes PUMs data
### Four functions:
    
#### 1) filtercolumns: gets relevant columns from pums data, joins with povdata
#### 2) rentersonly: gets renter hh from df from fxn 1
#### 3) calcPov: creates poverty summary table from df from fxn 1 or 2
#### 4) calcBurden: creates cost burden summary table from df from fxn 1 or 2

In [3]:
# read in poverty guidelines 
pov = pd.read_table("proj_data/pov_guidelines.csv", sep=',', low_memory=False)

#get poverty tables for ea year
pov15=pov.iloc[:,[0,1]]
pov10=pov.iloc[:,[0,6]]
pov05=pov.iloc[:,[0,11]]


In [4]:
#load pums data
pums15 = "proj_data/ss15hpa_1yr.csv"
pums10="proj_data/ss10hpa.csv"
pums05="proj_data/ss05hpa.csv"


In [24]:
def filtercolumns(pumsdata, povdata):
    """
    function filtercolumns pulls relevant columns from each year's pums households data (csv)
    and joins with poverty data based on hhd_size, 
    povdata (a DF of just that year's poverty guidelines) 
    returns df of pums data with joined pov data
    """
    pums = pd.read_table(pumsdata,sep=",", low_memory=False)
    df_relevant = pums[['SERIALNO','ST', 'PUMA', 'WGTP', 'NP', 'TEN', 'BLD', 'RNTP', 'VACS','HINCP','SMOCP','GRNTP', 'GRPIP','OCPIP', 'KIT', 'PLM']]
    pums_pov = df_relevant.merge(povdata, left_on='NP', right_on='hhd_size')
    return pums_pov

# test code
fd15=filtercolumns(pums15,pov15)
fd15.head()


Unnamed: 0,SERIALNO,ST,PUMA,WGTP,NP,TEN,BLD,RNTP,VACS,HINCP,SMOCP,GRNTP,GRPIP,OCPIP,KIT,PLM,hhd_size,pov_15
0,26,42,2600,37,1,3.0,2.0,500.0,,45000.0,,603.0,16.0,,1.0,1.0,1,11770.0
1,172,42,3502,0,1,,,,,,,,,,,,1,11770.0
2,206,42,1900,76,1,4.0,3.0,,,19000.0,,,,,1.0,1.0,1,11770.0
3,208,42,2002,80,1,3.0,9.0,290.0,,14900.0,,360.0,29.0,,1.0,1.0,1,11770.0
4,352,42,500,23,1,2.0,2.0,,,21700.0,755.0,,,42.0,1.0,1.0,1,11770.0


In [36]:
def rentersonly(data, yyyy):
    """
    pulls renter records 
    requires two arguments, 
    data is pums df with joined pov data
    yyyy is 4-digit year, eg 2015
    returns renters df
    
    """
    #convert yyyy to pov_yy, eg 2015 to pov_15
    yyyys = str(yyyy)
    yy=yyyys[-2:]
    povyear = "pov" + "_" + yy
    
    #filter renters with complete kitchen and plumbing
    r = data[data['TEN']==3]
    rK = r[r['KIT']==1]
    rKP = rK[rK['PLM']==1]
    return rKP

In [41]:
def calcPov(data, yyyy, exportname=""):
    """
    calculates % of HH below federal poverty line in each PUMA. 
    requires 2 arguments:
    data is pums df with joined pov data (can be df from rentersonly)
    yyyy is 4-digit year, eg 2015
    exportname is export csv name
    if no exportname is provided, returns pct poverty by puma as DF 
    otherwise returns csv
    
    """
    #convert yyyy to pov_yy, eg 2015 to pov_15
    yyyys = str(yyyy)
    yy=yyyys[-2:]
    povyear = "pov" + "_" + str(yy)
    
    #add new column
    data['WGTPpov'] = data['WGTP'].where(data['HINCP']<=data[povyear], 0)
    
    #group by Puma
    grp = data.groupby(["PUMA"])
    
    #summarize hhs below poverty
    povsum = grp[['WGTP','WGTPpov']].sum()
    
    #calc %
    povsum['PCTpov'] = (povsum['WGTPpov'] / povsum['WGTP'])*100
    
    #determine what fxn returns
    if len(exportname)>0:
        povsum.to_csv(exportname)
        print(str(yyyy)+" poverty summary table successfully exported to csv")
    else:
        return povsum
    
#code to test that it worked!
# naming convention: r15_pov
calcPov(rentersonly(fd15,2015),2015,"r15_pov.csv")

2015 poverty summary table successfully exported to csv


In [38]:
def calcBurden(data, yyyy, exportname=""):
    """
    calculates % of HH with cost burden greater than 30% in ea puma
    requires 2 arguments:
    data is pums df with joined pov data (can be df from rentersonly)
    exportname is export csv name
    if no exportname is provided, returns pct with cost burden >30% by puma as DF 
    otherwise returns csv    
    """
    #convert yyyy to pov_yy, eg 2015 to pov_15
    yyyys = str(yyyy)
    yy=yyyys[-2:]
    povyear = "pov" + "_" + str(yy)
    
    #from renters only table, grab only hh below pov line
    need = data[(data['HINCP']<=data[povyear]) & (data['GRPIP'].notnull())]
    
    #add new column
    need['WGTP30'] = need['WGTP'].where(need['GRPIP']>=30.0, 0)
    
    #group by puma
    grp = need.groupby(["PUMA"])
    
    #calculate totals by PUMA of hhds and hhds with burden over 30%
    burdensum = grp[['WGTP','WGTP30']].sum()
    
    #add pct column
    burdensum['PCTburden30'] = (burdensum['WGTP30'] / burdensum['WGTP'])*100
    
    #export or return df
    if len(exportname)>0:
        burdensum.to_csv(exportname)
        print(yyyy+" burden summary table successfully exported to csv")
    else:
        return burdensum


#code to test that it worked!
calcBurden(rentersonly(fd15,15),15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,WGTP,WGTP30,PCTburden30
PUMA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101,6917,6031,87.190979
102,2565,2133,83.157895
200,2851,2213,77.621887
300,3474,2613,75.215889
400,2238,1902,84.986595
500,2298,1961,85.335074
600,1655,1655,100.000000
701,4317,3681,85.267547
702,1815,1726,95.096419
801,1995,1776,89.022556


# summary
"""
basically, use filter fxn to filter columns + join pov, then use rentersonly to grab only renter 
households. then calcBurden or calcPov to get summary tables 
NOTE: should be able to adjust calcBurden and calcPov so we can use these fxns for 
owners too
"""

In [11]:
#Results: Run code here when ready to export

#no export, returns dfs
# calcBurden(rentersonly(filtercolumns(pums10, pov10),2010),2010)
# calcBurden(rentersonly(filtercolumns(pums05, pov05),2005),2005)
# calcPov(rentersonly(filtercolumns(pums10, pov10),2010),2010)
# calcPov(rentersonly(filtercolumns(pums05, pov05),2005),2005)

#export to csv
# calcBurden(rentersonly(filtercolumns(pums10, pov10),2010),2010,"r10_totalsburden30.csv")
# calcBurden(rentersonly(filtercolumns(pums05, pov05),2005),2005,"r05_totalsburden30.csv")
# calcPov(rentersonly(filtercolumns(pums10, pov10),2010),2010, "r10_poverty.csv")
# calcPov(rentersonly(filtercolumns(pums05, pov05),2005),2005, "r05_poverty.csv")

burden summary table successfully exported to csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
