In [1]:
import pandas as pd
import math
import numpy as np
import os

In [2]:
import pums

In [3]:
phx_pumas = ['0400112','0400113','0400114','0400115','0400116','0400117',\
             '0400118','0400119','0400120','0400121','0400122','0400123',\
             '0400125','0400128','0400129']

In [4]:
# create a list of replicate weights
repwt = 'REPWT'
repwts = [repwt+str(i) for i in range(1, 81)]

In [5]:
df = pd.read_csv(f'../data/ipums_az_hhinc-size.csv.gz',compression='gzip')
df['GEO_ID'] = df.STATEFIP.astype(str).str.zfill(2) + df.PUMA.astype(str).str.zfill(5)

#phoenix pumas only and get rid of NA for HH  income
df = df[(df['GEO_ID'].isin(phx_pumas))&(df['HHINCOME']!=9999999)&(df['YEAR']!=2010)] #&df.HHINCOME!=9999999

#adjust for inflation (in 1999 dollars), weight income for total households
df['HHINCOME_99'] = df.HHINCOME * df.CPI99
df['INC_WT_99'] = df.HHWT * df.HHINCOME_99
df['INC_WT'] = df.HHWT * df.HHINCOME

#categorize by household size for AMI comparison
df['HHSize'] = df.NUMPREC.apply(lambda x: str(x) if x <= 8 else '9+' )

In [6]:
# new df reduced to necessary variables and group-by for HH size, PUMA, and year
p = df[['GEO_ID','HHWT','INC_WT','HHSize']+repwts].copy().groupby(['GEO_ID','HHSize']).sum().reset_index()

In [7]:
p['AVG_INC'] = p['INC_WT'] / p['HHWT']
p['hh_SE'] = p.apply(lambda x: (pums.get_se(x['HHWT'],x[repwts])),axis=1)
p['hh_MOE'] = p.apply(lambda x: (pums.get_moe(x['hh_SE'])),axis=1)
p['hh_CV'] = p.apply(lambda x: (pums.get_cv(x['HHWT'],x['hh_SE'])),axis=1)

In [8]:
p.head()

Unnamed: 0,GEO_ID,HHSize,HHWT,INC_WT,REPWT1,REPWT2,REPWT3,REPWT4,REPWT5,REPWT6,...,REPWT75,REPWT76,REPWT77,REPWT78,REPWT79,REPWT80,AVG_INC,hh_SE,hh_MOE,hh_CV
0,400112,1,12269.0,819353100.0,12332,12914,13391,11923,12264,12647,...,11844,12444,12211,11191,11760,12023,66782.389763,779.22824,1281.830455,3.860909
1,400112,2,51910.0,8147022000.0,51294,52116,52094,52334,50808,50754,...,53148,53468,52068,50374,50662,53778,156945.138124,1951.618815,3210.412951,2.285483
2,400112,3,30852.0,5593913000.0,29544,27996,32094,29871,26565,29037,...,29319,29616,28851,28779,30867,30087,181314.446713,2646.625606,4353.699122,5.214868
3,400112,4,26248.0,6354897000.0,25956,26244,26180,26264,26204,25996,...,26148,25684,26036,27844,27984,25140,242109.775983,2091.447441,3440.431041,4.843785
4,400112,5,8155.0,1760520000.0,8780,7320,7545,9460,7460,7480,...,8660,6995,8075,7905,5965,8045,215882.33599,1598.991284,2630.340662,11.91945


In [9]:
p = p[['GEO_ID','HHSize','HHWT','AVG_INC','hh_MOE','hh_CV','INC_WT']].copy()

In [10]:
fin = pd.pivot_table(p,values=['AVG_INC','HHWT','hh_MOE','hh_CV','INC_WT'],\
                     index=['GEO_ID'],columns=['HHSize'],aggfunc=np.sum).reset_index()

In [11]:
fin

Unnamed: 0_level_0,GEO_ID,AVG_INC,AVG_INC,AVG_INC,AVG_INC,AVG_INC,AVG_INC,AVG_INC,AVG_INC,AVG_INC,...,hh_CV,hh_MOE,hh_MOE,hh_MOE,hh_MOE,hh_MOE,hh_MOE,hh_MOE,hh_MOE,hh_MOE
HHSize,Unnamed: 1_level_1,1,2,3,4,5,6,7,8,9+,...,9+,1,2,3,4,5,6,7,8,9+
0,400112,66782.389763,156945.138124,181314.446713,242109.775983,215882.33599,213922.878229,86452.542373,,,...,,1281.830455,3210.412951,4353.699122,3440.431041,2630.340662,3381.213728,2295.980987,,
1,400113,65803.2819,124776.364627,222802.549713,172451.807071,192191.951838,270945.948187,,,1049500.0,...,43.198827,1528.50991,3142.168924,3005.422394,3054.217769,1407.695594,2076.121127,,,797.238265
2,400114,49210.214207,95455.010256,105063.05108,109498.619883,105045.855821,146321.495327,84980.377358,,,...,,1589.735864,3533.755368,2445.453094,3474.869747,4005.827484,1165.012297,1516.980523,,
3,400115,34374.038194,86768.313314,82234.027614,100432.395368,137077.288684,118484.12844,72706.953056,45000.0,,...,,1479.524033,2744.859836,3569.824104,4321.495339,3343.539134,3516.440128,3865.597248,740.356735,
4,400116,55529.63516,100312.467423,87988.241835,106025.745426,145198.953537,195480.326797,134429.016189,56500.0,394211.1,...,30.920397,1569.554813,2906.973236,2549.6563,4159.093029,2738.5236,3238.483345,3273.818996,1465.255344,1524.49229
5,400117,64736.375042,131248.386713,174604.034025,153824.117779,238184.904372,96532.945274,96410.655738,,171550.4,...,33.473038,1584.514774,2785.284251,4009.366257,3702.197253,2718.653035,2318.846697,3118.048,,3274.426424
6,400118,40613.903187,88713.121954,99547.346397,56653.255419,99778.494624,87158.121331,73048.119048,,100688.6,...,37.661115,1679.054933,3105.290207,3344.205854,3666.14238,2780.933196,2845.598627,1533.207224,,1966.900006
7,400119,33967.531219,73395.101534,90784.292972,99434.057772,80686.602146,68424.065718,54179.185022,142062.54902,119786.2,...,32.810598,1045.553317,2312.914752,2808.030069,4576.60038,4152.685534,3523.657389,3199.863941,2630.691077,1685.163946
8,400120,74423.492123,133396.680928,131380.539185,143091.434857,179645.399147,154679.172714,103821.722846,149300.0,,...,,1327.46623,2939.579329,3050.63742,4469.87832,3122.24483,3424.721466,1219.766834,612.579977,
9,400121,40733.898009,63535.587119,71604.629727,91035.907199,90049.847888,84021.194825,158225.551684,88361.403509,212132.0,...,23.768656,1166.38281,2937.37844,4733.687525,4610.900903,5398.748341,4225.423255,3862.193484,1473.086983,2497.490323


In [12]:
fin.to_excel('output/pums_inc_2021.xlsx')

In [25]:
test = df.copy().drop_duplicates()
test = test[['GEO_ID','HHWT','INC_WT']]
test = test.groupby(['GEO_ID']).sum().reset_index()

In [26]:
test

Unnamed: 0,GEO_ID,HHWT,INC_WT
0,400112,58257.0,8971028000.0
1,400113,43132.0,5779745000.0
2,400114,47592.0,3910546000.0
3,400115,44062.0,3293003000.0
4,400116,50675.0,4408771000.0
5,400117,48019.0,5656208000.0
6,400118,49399.0,3350234000.0
7,400119,35433.0,2626096000.0
8,400120,42867.0,5243230000.0
9,400121,43773.0,3381347000.0


In [28]:
test.to_excel('output/pums_test.xlsx')