In [1]:
#load libraries
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from itertools import cycle, islice
import weightedcalcs as wc

# This line lets us plot on our ipython notebook
%matplotlib inline

In [2]:
pov = pd.read_table("proj_data/pov_guidelines.csv", sep=',', low_memory=False)

pov15=pov.iloc[:,[0,1]]
pov10=pov.iloc[:,[0,6]]
pov05=pov.iloc[:,[0,11]]


In [3]:
def filtercolumns(pumsdata, povdata):
    """
    function filtercolumns pulls relevant columns from pums households data (csv)
    and joins with poverty data based on hhd_size, povdata 
    (a df of just that year's poverty guidelines) 
    returns df of pums data with joined pov data
    """
    pums = pd.read_table(pumsdata,sep=",", low_memory=False)
    df_relevant = pums[['SERIALNO','ST', 'PUMA', 'ADJHSG', 'WGTP', 'NP', 'TEN', 'BLD', 'BDSP', 'RNTP', 'VACS','HINCP','SMOCP','GRNTP', 'GRPIP','OCPIP', 'KIT', 'PLM']]
    pums_pov = df_relevant.merge(povdata, left_on='NP', right_on='hhd_size')
    return pums_pov

# test code
# pums15 = "proj_data/ss15hpa_1yr.csv"
# fd15=filtercolumns(pums15,pov15)



In [4]:
def calcRpov(data, yy, exportname=""):
    """
    calculates % of renting HH below federal poverty line in each 
    PUMA. 
    requires three arguments, 
    data is pums df with joined pov data
    yy is 2-digit year, eg 15 for 2015
    exportname is export csv name
    returns pct poverty by puma as DF and CSV
    
    """
    povyear = "pov" + "_" + str(yy)
    r = data[data['TEN']==3]
#     print("r_ten:", len(r.index))
    rK = r[r['KIT']==1]
    rKP = rK[rK['PLM']==1]
#     print("r_kit,plm", len(rKP.index))
    rKP['WGTPpov'] = rKP['WGTP'].where(rKP['HINCP']<=rKP[povyear], 0)
    #group by Puma
    grp = rKP.groupby(["PUMA"])
    povsum = grp[['WGTP','WGTPpov']].sum()
    povsum['PCTpov'] = (povsum['WGTPpov'] / povsum['WGTP'])*100
#     rNeed = rKP[(rKP['HINCP']<=rKP[povyear]) & (rKP['GRPIP'].notnull())]
#     print("rbelow", len(rNeed.index))
    if len(exportname)>0:
        povsum.to_csv(exportname)
    else:
        return povsum




In [6]:
def calcBurden(data,povyear):
    povyear = "pov" + "_" + str(yy)
    r = data[data['TEN']==3]
#     print("r_ten:", len(r.index))
    rK = r[r['KIT']==1]
    rKP = rK[rK['PLM']==1]
    
    # FILTER for records where hincp is less than poverty guideline
    need = rKP[(rKP['HINCP']<=rKP[povyear]) & (rKP['GRPIP'].notnull())]
    need['WGTP30'] = need['WGTP'].where(need['GRPIP']>=30.0, 0)
    return need

pums10="proj_data/ss10hpa.csv"

calcRpov(filtercolumns(pums10, pov10), 10)

# calcBurden(calcRpov(filtercolumns(pums10, pov10), 10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,WGTP,WGTPpov,PCTpov
PUMA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,19435,8528,43.879599
200,13939,3174,22.770644
300,12698,4195,33.036699
400,7770,2063,26.550837
500,7961,2389,30.008793
600,10358,2843,27.447384
700,9290,2525,27.179763
801,18307,3748,20.473043
802,9587,1751,18.264316
901,16799,6505,38.722543


In [None]:

calcRpov(filtercolumns(pums10, pov10), 10,'r10_poverty.csv')