### Using IPUMS records, calculate in-migration and out-migration average family size to/from NYC - NYC Metro and domestic U.S., creating an adjustment factor for USPS individual/family COA records

Requires download of csv extracts from IPUMS USA website
https://usa.ipums.org

For more information about specific variables available for download, refer to IPUMS-USA website, ex.:
https://usa.ipums.org/usa-action/variables/MIGRATE1#codes_section

In [1]:
import pandas as pd
import numpy as np
import math

#### Stat functions for using replicate weights

In [2]:
# functions to calculate standard error, moe, and coefficient of variation
def get_se(var_wt,rep_weights):
    result = math.sqrt((sum(map(lambda x: (x-var_wt)**2,rep_weights))/20))
    return result

def get_moe(se):
    return se*1.645 #90% confidence interval

def agg_moe(m):
    result = math.sqrt(sum(map(lambda x: x**2, m)))
    return result

def get_cv(est,m):
    if est == 0:
        return 0
    else:
        return (np.absolute(m/1.645/est))*100

In [3]:
# create a list of replicate weights - households
repwt = 'REPWT'
repwts = [repwt+str(i) for i in range(1, 81)]

# create a list of replicate weights - people
repwtp = 'REPWTP'
repwtps = [repwtp+str(i) for i in range(1, 81)]

#### Geography look up files & cleanup

In [4]:
# pull in geography reference files
df_migpuma = pd.read_csv('../data/migpwpuma_xwalk_10.csv')
df_respuma = pd.read_csv('../data/respuma_xwalk_10.csv')
df_migpl = pd.read_csv('../data/migpl_xwalk.csv')

#df_migpuma.head()

In [5]:
# clean up migpuma table for merge with data table 
df_migpuma['MIGPUMA_str'] = df_migpuma['MIGPUMAID'].apply(str).apply(lambda x: '{0:0>8}'.format(x))
df_migpuma['CountyFIP'] = df_migpuma['CountyFIP'].apply(str)
df_migpuma = df_migpuma.drop(columns=['STATEFIP','State','MIGSTATE','MIGPUMA','PWSTATE','PWPUMA','StringGIS'])
#df_migpuma.head()

In [6]:
df_respuma['puma_id'] = df_respuma['GEOID10'].apply(str).apply(lambda x: '{0:0>8}'.format(x))
df_respuma['CountyFIPS'] = df_respuma['CountyFIPS'].apply(str).apply(lambda x: '{0:0>5}'.format(x))
df_respuma = df_respuma.drop(columns=['StateFIPS','PUMA'])

In [7]:
#forgot to pull COUNTYFIPS in pums extract for in-migration
#these are to clean the in-migration table for borough totals
nyc_county = dict(df_respuma[['puma_id','CountyFIPS']].values)
nyc_respuma = df_respuma[df_respuma['Subregion']=='NYC']['puma_id'].tolist()

### In-migration to NYC boroughs by NYC, NYC Metro, & U.S.

In [8]:
# read in full inflow table (raw PUMS data)
df = pd.read_csv('../data/mig_nys_in_16191YR_fam.csv')
#df.head()

In [9]:
# create a new id columns for merge with geo lookup tables
df['migpuma_id'] = df['MIGPLAC1'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                    df['MIGPUMA1'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

df['in_respuma'] = df['STATEFIP'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                   df['PUMA'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

In [10]:
# REDUCE DATA TABLE TO PEOPLE WHO HAVE MOVED TO NYC COUNTIES ONLY IN THE LAST YEAR
# Select only records for people who have moved in the last year
# MIGRATE1 == 2,3,4 ; other codes are people who haven't moved
mig_codes = [2,3,4]
df = df[df['MIGRATE1'].isin(mig_codes)]

# Select records for people who live in NYC currently
df = df[df['in_respuma'].isin(nyc_respuma)]
df['in_stco'] = df.in_respuma.map(nyc_county)

#Select only movers with a family size of >1
df = df[df['FAMSIZE']>1]

## create a column that estimates total number of families using hh weight
df['fam_id'] = df.SERIAL.astype(str)+'_'+df.FAMUNIT.astype(str)

#df.head()

In [11]:
# merge with geography look up tables 
df_in = df.merge(df_migpl,how='left',left_on='MIGPLAC1',right_on='migplac_id').merge(df_migpuma,how='left',left_on='migpuma_id',right_on='MIGPUMA_str')

In [12]:
# replace NaN values in aggregation columns with domestic/intl values
column_clean = ['Subregion5','Subregion7','Region','County','CountyFIP']
for i in column_clean:
    df_in.loc[df_in[i].isnull(),i] = df_in['US_intl']

In [13]:
# reduce table to just counties and subregions of residence 1 year ago
cols = ['YEAR','in_stco','CountyFIP','Region','PERNUM','PERWT','HHWT','fam_id'] + repwts + repwtps
df_in = df_in[cols]

# rename for clarity
df_in = df_in.rename(columns={'County':'out_co_name','CountyFIP':'out_stco',\
                              'Region':'out_reg','PERWT':'in_pop','HHWT':'in_fam'})


#create id for merge between household & person table at end
df_in['id'] = df_in['YEAR'].astype(str)+df_in['in_stco']+df_in['out_reg']

#### Household table

In [14]:
fam_in = df_in.drop_duplicates(subset=['fam_id']).groupby(['id']).sum().reset_index()
fam_in = fam_in.drop(columns=['YEAR','PERNUM','in_pop']).drop(columns=repwtps)
#fam_in.head()

In [15]:
#Make statistical variables

fam_in['in_fam_se'] = fam_in.apply(lambda x: (get_se(x['in_fam'],x[repwts])),axis=1)
fam_in['in_fam_moe'] = fam_in.apply(lambda x: (get_moe(x['in_fam_se'])),axis=1)
fam_in['in_fam_cv'] = fam_in.apply(lambda x: (get_cv(x['in_fam'],x['in_fam_se'])),axis=1)

#drop replicate weights, clean table for join with person table
fam_in = fam_in.drop(columns=repwts) 

#### Person table

In [16]:
pop_in = df_in.groupby(['id','YEAR','in_stco','out_reg']).sum().reset_index()
pop_in = pop_in.drop(columns=['PERNUM','in_fam']).drop(columns=repwts)
#pop_in.head()

In [17]:
# calculate standard error, margin of error, cv of migpop

pop_in['in_pop_se'] = pop_in.apply(lambda x: (get_se(x['in_pop'],x[repwtps])),axis=1)
pop_in['in_pop_moe'] = pop_in.apply(lambda x: (get_moe(x['in_pop_se'])),axis=1)
pop_in['in_pop_cv'] = pop_in.apply(lambda x: (get_cv(x['in_pop'],x['in_pop_se'])),axis=1)

# drop replicate weight columns
pop_in = pop_in.drop(columns=repwtps) 

#pop_in

In [18]:
#merge person & household tables
dff_in = pd.merge(pop_in,fam_in,how='left',on='id')

In [19]:
#calc average family size
dff_in['in_avgfam'] = dff_in.apply(lambda x:(x['in_pop']/x['in_fam']),axis=1)

In [20]:
dff_in

Unnamed: 0,id,YEAR,in_stco,out_reg,in_pop,in_pop_se,in_pop_moe,in_pop_cv,in_fam,in_fam_se,in_fam_moe,in_fam_cv,in_avgfam
0,201636005NYC,2016,36005,NYC,71489.0,7305.959581,12018.303511,6.212582,30098.0,2702.936986,4446.331342,5.459242,2.375208
1,201636005Region,2016,36005,Region,1476.0,486.799753,800.785594,20.049248,690.0,231.568348,380.929933,20.401599,2.139130
2,201636005US,2016,36005,US,6778.0,1445.757777,2378.271543,12.966658,3811.0,775.718409,1276.056783,12.373690,1.778536
3,201636005intl,2016,36005,intl,16594.0,2983.417797,4907.722276,10.929419,8656.0,1173.427331,1930.287959,8.240870,1.917052
4,201636047NYC,2016,36047,NYC,133082.0,7620.674294,12536.009214,3.481033,54442.0,2403.981229,3954.549122,2.684300,2.444473
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,201936081intl,2019,36081,intl,17648.0,2164.673070,3560.887200,7.456430,8363.0,1056.710272,1738.288398,7.681180,2.110248
76,201936085NYC,2019,36085,NYC,19129.0,4123.598744,6783.319934,13.104433,6041.0,1205.789555,1983.523817,12.133799,3.166529
77,201936085Region,2019,36085,Region,1027.0,548.654126,902.536037,32.475983,561.0,272.845103,448.830194,29.565648,1.830660
78,201936085US,2019,36085,US,456.0,200.844094,330.388534,26.774929,250.0,105.406831,173.394237,25.630840,1.824000


### NYC resident out-migration to NYC, NYC Metro and U.S.

In [21]:
# pull in outflow table
dff = pd.read_csv('../data/mig_nys_out_16191YR_fam.csv')

In [22]:
nyc_mig = {'03603700':'36005','03603800':'36061','03603900':'36085','03604000':'36047','03604100':'36081'}

In [23]:
# create new id columns for merge with geo lookup table
dff['in_respuma'] = dff['STATEFIP'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                    dff['PUMA'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

dff['out_migpuma'] = dff['MIGPLAC1'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                    dff['MIGPUMA1'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

In [24]:
# Select records only for people who lived in NYC one year ago (NYC out-migrants)
dff = dff[dff['out_migpuma'].isin(nyc_mig.keys())]
dff['out_stco'] = dff.out_migpuma.map(nyc_mig)

#Select only movers with a family size of >1
dff = dff[dff['FAMSIZE']>1]

## create a column to estimate total number of families using unique id
dff['fam_id'] = dff.SERIAL.astype(str)+'_'+dff.FAMUNIT.astype(str)

dff.head()

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,HHTYPE,REPWT,CLUSTER,STATEFIP,PUMA,...,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80,in_respuma,out_migpuma,out_stco,fam_id
19,2016,201601,25826,50701.0,142,3,1,2016000000000.0,4,600,...,189,262,135,151,249,258,400600,3604000,36047,25826_1
20,2016,201601,25826,50701.0,142,3,1,2016000000000.0,4,600,...,191,292,173,139,273,259,400600,3604000,36047,25826_1
37,2016,201601,40956,836433.0,118,1,1,2016000000000.0,4,126,...,77,87,139,77,83,85,400126,3603800,36061,40956_1
50,2016,201601,57046,323315.0,162,9,1,2016000000000.0,5,1600,...,158,280,158,146,314,38,501600,3604000,36047,57046_1
57,2016,201601,65681,1291896.0,76,1,1,2016000000000.0,5,100,...,67,65,26,80,139,103,500100,3603800,36061,65681_1


In [29]:
# merge with geography look up tables 
df_out = dff.merge(df_respuma,how='left',left_on='in_respuma',right_on='puma_id')

In [30]:
# replace NaN values in aggregation columns with US for non region
column_clean = ['NAME','Subregion','Subregion2','Region','County','CountyFIPS']
for i in column_clean:
    df_out.loc[df_out[i].isnull(),i] = 'US'

In [31]:
# reduce table to just current counties and subregions of residence 
cols = ['YEAR','County','CountyFIPS','Region','out_stco','PERNUM','PERWT','HHWT','fam_id'] + repwts + repwtps
df_out = df_out[cols]

# rename for clarity
df_out = df_out.rename(columns={'County':'in_co_name','CountyFIPS':'in_stco',\
                              'Region':'in_reg','PERWT':'out_pop','HHWT':'out_fam'})
    

#create id for merge between household & person table
df_out['id'] = df_out['YEAR'].astype(str)+df_out['out_stco']+df_out['in_reg']
    
df_out.head()

Unnamed: 0,YEAR,in_co_name,in_stco,in_reg,out_stco,PERNUM,out_pop,out_fam,fam_id,REPWT1,...,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80,id
0,2016,US,US,US,36047,1,142,142,25826_1,158,...,268,140,41,189,262,135,151,249,258,201636047US
1,2016,US,US,US,36047,2,170,142,25826_1,158,...,271,146,52,191,292,173,139,273,259,201636047US
2,2016,US,US,US,36061,2,76,118,40956_1,123,...,73,58,141,77,87,139,77,83,85,201636061US
3,2016,US,US,US,36047,1,162,162,57046_1,251,...,217,133,245,158,280,158,146,314,38,201636047US
4,2016,US,US,US,36061,1,76,76,65681_1,117,...,150,21,74,67,65,26,80,139,103,201636061US


#### Household table

In [32]:
fam_out = df_out.drop_duplicates(subset=['fam_id']).groupby(['id']).sum().reset_index()
fam_out = fam_out.drop(columns=['YEAR','PERNUM','out_pop']).drop(columns=repwtps)
#fam_out.head()

In [33]:
#Make statistical variables

fam_out['out_fam_se'] = fam_out.apply(lambda x: (get_se(x['out_fam'],x[repwts])),axis=1)
fam_out['out_fam_moe'] = fam_out.apply(lambda x: (get_moe(x['out_fam_se'])),axis=1)
fam_out['out_fam_cv'] = fam_out.apply(lambda x: (get_cv(x['out_fam'],x['out_fam_se'])),axis=1)

#drop replicate weights, clean table for joout with person table
fam_out = fam_out.drop(columns=repwts) 

#### Person table

In [34]:
pop_out = df_out.groupby(['id','YEAR','out_stco','in_reg']).sum().reset_index()
pop_out = pop_out.drop(columns=['PERNUM','out_fam']).drop(columns=repwts)
#pop_out.head()

In [35]:
# calculate standard error, margin of error, cv of migpop

pop_out['out_pop_se'] = pop_out.apply(lambda x: (get_se(x['out_pop'],x[repwtps])),axis=1)
pop_out['out_pop_moe'] = pop_out.apply(lambda x: (get_moe(x['out_pop_se'])),axis=1)
pop_out['out_pop_cv'] = pop_out.apply(lambda x: (get_cv(x['out_pop'],x['out_pop_se'])),axis=1)

# drop replicate weight columns
pop_out = pop_out.drop(columns=repwtps) 

#pop_out

In [36]:
#merge person & household tables
dff_out = pd.merge(pop_out,fam_out,how='left',on='id')

In [37]:
#calc average family size
dff_out['out_avgfam'] = dff_out.apply(lambda x:(x['out_pop']/x['out_fam']),axis=1)

In [38]:
dff_out

Unnamed: 0,id,YEAR,out_stco,in_reg,out_pop,out_pop_se,out_pop_moe,out_pop_cv,out_fam,out_fam_se,out_fam_moe,out_fam_cv,out_avgfam
0,201636005NYC,2016,36005,NYC,65659,6811.75771,11205.341433,6.306654,27455,2513.235196,4134.271897,5.564752,2.391513
1,201636005Region,2016,36005,Region,12277,2397.955431,3944.636684,11.873615,4862,824.740717,1356.698479,10.31185,2.525093
2,201636005US,2016,36005,US,17560,2733.395526,4496.43564,9.462634,7597,1155.086793,1900.117775,9.242865,2.311439
3,201636047NYC,2016,36047,NYC,137285,7832.446811,12884.375004,3.468235,59192,2484.338383,4086.73664,2.551419,2.319317
4,201636047Region,2016,36047,Region,17271,2872.498921,4725.260725,10.11059,6923,872.128517,1434.65141,7.658086,2.494728
5,201636047US,2016,36047,US,46992,4854.63169,7985.869131,6.280098,22425,1847.713952,3039.489452,5.008831,2.095518
6,201636061NYC,2016,36061,NYC,78530,6042.501038,9939.914208,4.677515,36760,2437.77058,4010.132604,4.031359,2.136289
7,201636061Region,2016,36061,Region,18804,2244.042279,3691.449549,7.254624,8388,966.722168,1590.257966,7.006116,2.241774
8,201636061US,2016,36061,US,30069,3410.381115,5610.076934,6.894742,17106,1668.147505,2744.102646,5.928162,1.757804
9,201636081NYC,2016,36081,NYC,81374,6030.743988,9920.57386,4.505254,32060,1862.319468,3063.515524,3.53122,2.538178


## Export in and out table to Excel

In [39]:
dff_in.to_excel('../output/nycmig_in_reg_boroughs_2016-2019_fam3.xlsx')

In [40]:
dff_out.to_excel('../output/nycmig_out_reg_boroughs_2016-2019_fam3.xlsx')