### Using IPUMS records, calculate in-migration and out-migration to/from NYC - NYC Metro and domestic U.S.

Requires download of csv extracts from IPUMS USA website
https://usa.ipums.org

For more information about specific variables available for download, refer to IPUMS-USA website, ex.:
https://usa.ipums.org/usa-action/variables/MIGRATE1#codes_section

In [1]:
import pandas as pd
import numpy as np
import math

#### Stat functions for using replicate weights

In [2]:
# functions to calculate standard error, moe, and coefficient of variation
def get_se(per_wt,rep_weights):
    """Standard error of an estimate from its replicate-weight estimates.

    per_wt is the full-sample estimate and rep_weights is an iterable of the
    corresponding replicate estimates. The squared deviations of the
    replicates from per_wt are summed, divided by 20, and square-rooted.
    NOTE(review): the divisor 20 looks like the 4/80 factor used with 80
    replicate weights - confirm before reusing with a different count.
    """
    squared_devs = [(rep - per_wt)**2 for rep in rep_weights]
    return math.sqrt(sum(squared_devs)/20)

def get_moe(se):
    """Turn a standard error into a margin of error at 90% confidence."""
    z_90 = 1.645  # multiplier for a 90% confidence interval
    return se*z_90

def agg_moe(m):
    """Combine margins of error: square root of the sum of their squares.

    m is any iterable of margins of error; an empty iterable gives 0.0.
    """
    total = 0
    for moe in m:
        total += moe**2
    return math.sqrt(total)

def get_cv(est,se):  #consider deprecating
    """Coefficient of variation, in percent, of an estimate.

    est is the estimate and se its standard error; the result is
    se / est * 100.

    A scalar estimate of 0 returns 0 instead of dividing by zero, which is
    the same convention get_cv_2 uses. Non-scalar inputs (e.g. whole
    columns) are passed straight through to the division as before.
    """
    if np.isscalar(est) and est == 0:
        # the ratio is undefined for a zero estimate; report 0 like get_cv_2
        return 0
    return (se/est*100)

def get_cv_2(est,m):
    """Coefficient of variation, in percent, from a margin of error.

    m is divided by 1.645 (the same multiplier get_moe applies) and then by
    est; the absolute value of that ratio is returned as a percentage.
    An estimate of 0 returns 0.
    """
    if est != 0:
        return (np.absolute(m/1.645/est))*100
    return 0

In [3]:
# build the list of replicate weight column names: REPWTP1 ... REPWTP80
repwt = 'REPWTP'
repwts = ['{}{}'.format(repwt, rep_num) for rep_num in range(1, 81)]

#### Geography look up files & cleanup

In [4]:
# load the three geography reference (crosswalk) tables, in this order:
# migration PUMA lookup, residence PUMA lookup, migration place lookup
df_migpuma, df_respuma, df_migpl = [
    pd.read_csv(xwalk_path)
    for xwalk_path in (
        'data/migpwpuma_xwalk_10.csv',
        'data/respuma_xwalk_10.csv',
        'data/migpl_xwalk.csv',
    )
]

In [5]:
# prepare the migpuma lookup for merging with the data table:
#  - MIGPUMA_str: MIGPUMAID as text, left-padded with zeros to 8 characters
#  - CountyFIP stored as text
#  - columns not needed downstream are dropped
df_migpuma['MIGPUMA_str'] = df_migpuma['MIGPUMAID'].map(lambda v: str(v).rjust(8, '0'))
df_migpuma['CountyFIP'] = df_migpuma['CountyFIP'].map(str)
df_migpuma = df_migpuma.drop(columns=[
    'STATEFIP', 'State', 'MIGSTATE', 'MIGPUMA', 'PWSTATE', 'PWPUMA', 'StringGIS',
])

In [6]:
# prepare the residence PUMA lookup for merging with the data table:
#  - puma_id: GEOID10 as text, left-padded with zeros to 8 characters
#  - CountyFIPS as text, left-padded with zeros to 5 characters
df_respuma['puma_id'] = df_respuma['GEOID10'].map(lambda v: str(v).rjust(8, '0'))
df_respuma['CountyFIPS'] = df_respuma['CountyFIPS'].map(lambda v: str(v).rjust(5, '0'))
df_respuma = df_respuma.drop(columns=['StateFIPS', 'PUMA'])

### In-migrants to NYC boroughs by NYC Metro Subregion & County
The following options are available:
- By NYC total or borough of current residence (i.e. destination)
- By NYC Metro subregion of residence 1 year ago (i.e. origin)

In [7]:
# read in full inflow table (raw PUMS data)
# this is one of the csv extracts that has to be downloaded from IPUMS USA
# beforehand; update the path if the extract is stored under another name
df = pd.read_csv('data/nyc_inflow_110620.csv')
#df.head()

In [8]:
# create new zero-padded id columns for the merges with the geo lookup tables

# migpuma_id: MIGPLAC1 padded to 3 characters + MIGPUMA1 padded to 5 characters
df['migpuma_id'] = (
    df['MIGPLAC1'].map(lambda v: str(v).rjust(3, '0'))
    + df['MIGPUMA1'].map(lambda v: str(v).rjust(5, '0'))
)

# in_stco: STATEFIP padded to 2 characters + COUNTYFIP padded to 3 characters
df['in_stco'] = (
    df['STATEFIP'].map(lambda v: str(v).rjust(2, '0'))
    + df['COUNTYFIP'].map(lambda v: str(v).rjust(3, '0'))
)

In [9]:
# REDUCE DATA TABLE TO PEOPLE WHO HAVE MOVED TO NYC COUNTIES ONLY IN THE LAST YEAR

# MIGRATE1 codes 2, 3 and 4 are people who moved in the last year;
# other codes are people who haven't moved
mig_codes = [2,3,4]

# state+county FIPS codes of the counties treated as NYC
nyc = ['36005','36047','36061','36081','36085']

# keep only the records that are both a mover and a current NYC resident
df = df[df['MIGRATE1'].isin(mig_codes) & df['in_stco'].isin(nyc)]

In [10]:
# attach the geography lookups with two left joins, so every mover record is
# kept even when it has no match: first the migration place table (on
# MIGPLAC1), then the migration PUMA table (on the padded migpuma_id)
df_in = (
    df.merge(df_migpl, how='left', left_on='MIGPLAC1', right_on='migplac_id')
      .merge(df_migpuma, how='left', left_on='migpuma_id', right_on='MIGPUMA_str')
)

In [11]:
# records with no migpuma match have NaN in the aggregation columns;
# fill those gaps with the row's domestic/intl value from US_intl
column_clean = ['Subregion5','Subregion7','County','CountyFIP']
for col in column_clean:
    df_in[col] = df_in[col].fillna(df_in['US_intl'])

In [12]:
# keep only current county, county/subregion of residence 1 year ago,
# the person weight and the replicate weights
cols = ['in_stco','County','CountyFIP','Subregion7','Subregion5','PERWT'] + repwts

# select and rename in one step: out_* columns describe the origin,
# in_pop is the person weight of the in-migrant
df_in = df_in[cols].rename(columns={
    'County': 'out_co_name',
    'CountyFIP': 'out_stco',
    'Subregion5': 'out_subreg_5',
    'Subregion7': 'out_subreg_7',
    'PERWT': 'in_pop',
})

df_in.head()

Unnamed: 0,in_stco,out_co_name,out_stco,out_subreg_7,out_subreg_5,in_pop,REPWTP1,REPWTP2,REPWTP3,REPWTP4,...,REPWTP71,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80
0,36005,Manhattan,36061,NYC,NYC,32.0,8,33,57,35,...,30,33,34,33,10,33,49,8,30,10
1,36005,intl,intl,intl,intl,24.0,6,23,41,27,...,25,24,24,23,7,25,35,7,25,7
2,36005,Manhattan,36061,NYC,NYC,24.0,6,24,38,25,...,23,24,24,22,8,23,38,7,26,6
3,36005,Manhattan,36061,NYC,NYC,26.0,7,25,42,24,...,26,25,27,24,7,24,38,7,25,7
4,36047,Monmouth,34025,NJ Out,NJ,44.0,79,49,15,12,...,41,64,12,43,13,52,92,43,67,72


#### NYC total by Subregion of Origin (simplified) table

Select different summary columns in groupby function to choose counties or aggregation by borough of current residence.

In [13]:
# total the person weight and every replicate weight by subregion of origin.
# Only the weight columns are aggregated: summing the text columns (county
# names / FIPS codes) is meaningless, and pandas 2.0+ no longer drops them
# silently from a bare .sum().
# To summarise by another geography, change the groupby key(s) only.
dff_in = df_in.groupby('out_subreg_5')[['in_pop'] + repwts].sum().reset_index()
#dff_in

In [14]:
# standard error comes from the spread of the replicate weight totals around
# the in_pop total; margin of error and cv are then derived from it
dff_in['in_se'] = dff_in.apply(lambda row: get_se(row['in_pop'], row[repwts]), axis=1)
dff_in['in_moe'] = dff_in['in_se'].apply(get_moe)
dff_in['in_cv'] = dff_in.apply(lambda row: get_cv(row['in_pop'], row['in_se']), axis=1)

# the replicate weight columns are no longer needed once the se is computed
dff_in = dff_in.drop(columns=repwts)

dff_in

Unnamed: 0,out_subreg_5,in_pop,in_se,in_moe,in_cv
0,CT,5414.0,582.69692,958.536433,10.76278
1,HV,12948.0,851.475778,1400.677654,6.576118
2,LI,17496.0,1043.578411,1716.686486,5.964669
3,NJ,20497.0,1039.881652,1710.605317,5.073336
4,NYC,578529.0,6872.004838,11304.447959,1.187841
5,US,106003.0,2085.145439,3430.064246,1.967063
6,intl,89289.0,2485.123277,4088.02779,2.783236


### NYC resident out-migration to NYC Metro and U.S.

##### Update the csv paths as needed, and keep the aggregation geography consistent with the in-migration section above so the net flow table in the following section lines up

In [15]:
# pull in outflow table
# csv extract downloaded from IPUMS USA beforehand; update the path if the
# extract is stored under another name
dff = pd.read_csv('data/nyc_outflow_110620.csv')

In [16]:
# create new zero-padded id columns for the merge with the geo lookup table

# in_respuma: STATEFIP padded to 3 characters + PUMA padded to 5 characters
dff['in_respuma'] = (
    dff['STATEFIP'].map(lambda v: str(v).rjust(3, '0'))
    + dff['PUMA'].map(lambda v: str(v).rjust(5, '0'))
)

# out_stco: MIGPLAC1 padded to 2 characters + MIGCOUNTY1 padded to 3 characters
dff['out_stco'] = (
    dff['MIGPLAC1'].map(lambda v: str(v).rjust(2, '0'))
    + dff['MIGCOUNTY1'].map(lambda v: str(v).rjust(3, '0'))
)

In [17]:
# keep only people who lived in NYC one year ago (NYC out-migrants);
# `nyc` is the county FIPS list defined in the in-migration section
dff = dff.loc[dff['out_stco'].isin(nyc)]

In [18]:
# attach the residence PUMA lookup with a left join on the padded PUMA id,
# so out-migrant records without a match in the lookup are kept
df_out = pd.merge(
    dff,
    df_respuma,
    how='left',
    left_on='in_respuma',
    right_on='puma_id',
)

In [19]:
# records with no match in the residence PUMA lookup have NaN in the
# aggregation columns; label them 'US'
column_clean = ['NAME','Subregion','Subregion2','County','CountyFIPS']
df_out[column_clean] = df_out[column_clean].fillna('US')

In [20]:
# keep only current county/subregion of residence, county of residence
# 1 year ago, the person weight and the replicate weights
cols = ['County','CountyFIPS','Subregion','Subregion2','out_stco','PERWT'] + repwts

# select and rename in one step: in_* columns describe the destination,
# out_pop is the person weight of the out-migrant
df_out = df_out[cols].rename(columns={
    'County': 'in_co_name',
    'CountyFIPS': 'in_stco',
    'Subregion': 'in_subreg_7',
    'Subregion2': 'in_subreg_5',
    'PERWT': 'out_pop',
})

df_out.head()

Unnamed: 0,in_co_name,in_stco,in_subreg_7,in_subreg_5,out_stco,out_pop,REPWTP1,REPWTP2,REPWTP3,REPWTP4,...,REPWTP71,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80
0,US,US,US,US,36047,44.0,29,52,84,56,...,53,65,15,57,54,47,40,10,46,18
1,US,US,US,US,36047,42.0,31,40,63,41,...,43,45,10,85,55,38,77,11,47,20
2,US,US,US,US,36081,13.0,23,5,22,19,...,12,13,21,17,13,13,4,14,14,4
3,US,US,US,US,36081,9.0,10,3,7,12,...,16,2,2,11,7,16,7,9,11,10
4,US,US,US,US,36061,13.0,13,11,20,12,...,26,11,14,19,13,21,4,4,14,12


#### NYC total outflow by Subregion Destination (simplified) table

Select different summary columns in groupby function to choose counties or aggregation by borough of previous residence.

In [21]:
# total the person weight and every replicate weight by subregion of
# destination. Only the weight columns are aggregated: summing the text
# columns (county names / FIPS codes) is meaningless, and pandas 2.0+ no
# longer drops them silently from a bare .sum().
# To summarise by another geography, change the groupby key(s) only.
dff_out = df_out.groupby('in_subreg_5')[['out_pop'] + repwts].sum().reset_index()
#dff_out

In [22]:
# standard error comes from the spread of the replicate weight totals around
# the out_pop total; margin of error and cv are then derived from it
dff_out['out_se'] = dff_out.apply(lambda row: get_se(row['out_pop'], row[repwts]), axis=1)
dff_out['out_moe'] = dff_out['out_se'].apply(get_moe)
dff_out['out_cv'] = dff_out.apply(lambda row: get_cv(row['out_pop'], row['out_se']), axis=1)

# the replicate weight columns are no longer needed once the se is computed
dff_out = dff_out.drop(columns=repwts)

dff_out

Unnamed: 0,in_subreg_5,out_pop,out_se,out_moe,out_cv
0,CT,8809.0,755.776753,1243.252758,8.579598
1,HV,29645.0,1669.369013,2746.112026,5.631199
2,LI,30721.0,1434.859436,2360.343772,4.670614
3,NJ,39341.0,1505.038023,2475.787547,3.825622
4,NYC,578529.0,6872.004838,11304.447959,1.187841
5,US,201838.0,4060.715731,6679.877378,2.011869


### Merge in & out to create net flow columns

In [23]:
# left-join the outflow summary onto the inflow summary by subregion.
# Inflow origins without a matching outflow row (intl in the table below)
# get 0 in every outflow column, including the in_subreg_5 label.
df_net = dff_in.merge(
    dff_out,
    how='left',
    left_on='out_subreg_5',
    right_on='in_subreg_5',
).fillna(0)

In [24]:
# net flow = in-migrants minus out-migrants
df_net['net_pop'] = df_net['in_pop'].sub(df_net['out_pop'])
# margin of error of the difference: root of the sum of the squared in/out moes
df_net['net_moe'] = df_net[['in_moe','out_moe']].apply(agg_moe, axis=1)
# cv of the net flow, derived from its margin of error (0 when net_pop is 0)
df_net['net_cv'] = df_net.apply(lambda row: get_cv_2(row['net_pop'], row['net_moe']), axis=1)

In [25]:
# display the combined inflow / outflow / net flow table
df_net

Unnamed: 0,out_subreg_5,in_pop,in_se,in_moe,in_cv,in_subreg_5,out_pop,out_se,out_moe,out_cv,net_pop,net_moe,net_cv
0,CT,5414.0,582.69692,958.536433,10.76278,CT,8809.0,755.776753,1243.252758,8.579598,-3395.0,1569.862896,28.109689
1,HV,12948.0,851.475778,1400.677654,6.576118,HV,29645.0,1669.369013,2746.112026,5.631199,-16697.0,3082.698355,11.223458
2,LI,17496.0,1043.578411,1716.686486,5.964669,LI,30721.0,1434.859436,2360.343772,4.670614,-13225.0,2918.601585,13.415697
3,NJ,20497.0,1039.881652,1710.605317,5.073336,NJ,39341.0,1505.038023,2475.787547,3.825622,-18844.0,3009.268106,9.707824
4,NYC,578529.0,6872.004838,11304.447959,1.187841,NYC,578529.0,6872.004838,11304.447959,1.187841,0.0,15986.903619,0.0
5,US,106003.0,2085.145439,3430.064246,1.967063,US,201838.0,4060.715731,6679.877378,2.011869,-95835.0,7509.068019,4.763169
6,intl,89289.0,2485.123277,4088.02779,2.783236,0,0.0,0.0,0.0,0.0,89289.0,4088.02779,2.783236


In [26]:
# export the net flow table to an Excel workbook
# NOTE(review): assumes the output/ folder already exists - confirm, or
# create it before running this cell
df_net.to_excel('output/nyc_subregion_migration_1418.xlsx')