### Using IPUMS records, calculate in-migration and out-migration to/from NYC - NYC Metro and domestic U.S.

Requires download of csv extracts from IPUMS USA website
https://usa.ipums.org

For more information about specific variables available for download, refer to IPUMS-USA website, ex.:
https://usa.ipums.org/usa-action/variables/MIGRATE1#codes_section

This notebook updates the previous to pull 4 1-year Estimates for 2016 to 2019

In [1]:
import pandas as pd
import numpy as np
import math

#### Stat functions for using replicate weights

In [2]:
# functions to calculate standard error, moe, and coefficient of variation
def get_se(per_wt,rep_weights):
    result = math.sqrt((sum(map(lambda x: (x-per_wt)**2,rep_weights))/20))
    return result

def get_moe(se):
    return se*1.645 #90% confidence interval

def agg_moe(m):
    result = math.sqrt(sum(map(lambda x: x**2, m)))
    return result

def get_cv(est,m):
    if est == 0:
        return 0
    else:
        return (np.absolute(m/1.645/est))*100

In [3]:
# create a list of replicate weights
repwt = 'REPWTP'
repwts = [repwt+str(i) for i in range(1, 81)]

#### Geography look up files & cleanup

In [4]:
# pull in geography reference files
df_migpuma = pd.read_csv('../data/migpwpuma_xwalk_10.csv')
df_respuma = pd.read_csv('../data/respuma_xwalk_10.csv')
df_migpl = pd.read_csv('../data/migpl_xwalk.csv')

#df_migpuma.head()

In [5]:
# clean up migpuma table for merge with data table 
df_migpuma['MIGPUMA_str'] = df_migpuma['MIGPUMAID'].apply(str).apply(lambda x: '{0:0>8}'.format(x))
df_migpuma['CountyFIP'] = df_migpuma['CountyFIP'].apply(str)
df_migpuma = df_migpuma.drop(columns=['STATEFIP','State','MIGSTATE','MIGPUMA','PWSTATE','PWPUMA','StringGIS'])
#df_migpuma.head()

In [6]:
df_respuma['puma_id'] = df_respuma['GEOID10'].apply(str).apply(lambda x: '{0:0>8}'.format(x))
df_respuma['CountyFIPS'] = df_respuma['CountyFIPS'].apply(str).apply(lambda x: '{0:0>5}'.format(x))
df_respuma = df_respuma.drop(columns=['StateFIPS','PUMA'])

In [7]:
#forgot to pull COUNTYFIPS in pums extract for in-migration
#these are to clean the in-migration table for borough totals
nyc_county = dict(df_respuma[['puma_id','CountyFIPS']].values)
nyc_respuma = df_respuma[df_respuma['Subregion']=='NYC']['puma_id'].tolist()

### In-migrants to NYC boroughs by NYC Metro Subregion & County
Following options available:
- By NYC total or Borough of Current Residence (i.e. destination)
- BY NYC Metro Subregion of residence 1 year ago (i.e. origin)

In [8]:
# read in full inflow table (raw PUMS data)
df = pd.read_csv('../data/mig_nys_in_16191YR_fam.csv')
#df.head()

In [9]:
# create a new id columns for merge with geo lookup tables
df['migpuma_id'] = df['MIGPLAC1'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                    df['MIGPUMA1'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

df['in_respuma'] = df['STATEFIP'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                   df['PUMA'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

In [11]:
# REDUCE DATA TABLE TO PEOPLE WHO HAVE MOVED TO NYC COUNTIES ONLY IN THE LAST YEAR
# Select only records for people who have moved in the last year
# MIGRATE1 == 2,3,4 ; other codes are people who haven't moved
mig_codes = [2,3,4]
df = df[df['MIGRATE1'].isin(mig_codes)]

# Select records for people who live in NYC currently
df = df[df['in_respuma'].isin(nyc_respuma)]
df['in_stco'] = df.in_respuma.map(nyc_county)

#Select only movers with a family size of >1
df = df[df['FAMSIZE']>1]

df.head()

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,HHTYPE,CLUSTER,STATEFIP,PUMA,STRATA,...,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80,migpuma_id,in_respuma,in_stco
26,2016,201601,810951,261,150.0,1,2016008109511,36,3902,390236,...,154,165,45,140,44,131,158,3604000,3603902,36085
27,2016,201601,810951,261,150.0,1,2016008109511,36,3902,390236,...,126,166,47,114,39,130,144,3604000,3603902,36085
28,2016,201601,810951,261,150.0,1,2016008109511,36,3902,390236,...,171,152,48,119,45,186,176,3604000,3603902,36085
29,2016,201601,810951,261,150.0,1,2016008109511,36,3902,390236,...,99,116,38,98,31,92,130,3604000,3603902,36085
30,2016,201601,810951,261,150.0,1,2016008109511,36,3902,390236,...,79,93,30,73,23,73,88,3604000,3603902,36085


In [13]:
## create a column that estimates pop per HH
df['famwt'] = df.apply(lambda x:(x['PERWT']*x['FAMSIZE']),axis=1)

In [14]:
# merge with geography look up tables 
df_in = df.merge(df_migpl,how='left',left_on='MIGPLAC1',right_on='migplac_id').merge(df_migpuma,how='left',left_on='migpuma_id',right_on='MIGPUMA_str')
#df_in.head()

In [15]:
# replace NaN values in aggregation columns with domestic/intl values
column_clean = ['Subregion5','Subregion7','Region','County','CountyFIP']
for i in column_clean:
    df_in.loc[df_in[i].isnull(),i] = df_in['US_intl']

In [16]:
# reduce table to just counties and subregions of residence 1 year ago
cols = ['YEAR','in_stco','CountyFIP','Subregion7','Subregion5','Region','PERWT','famwt'] + repwts
df_in = df_in[cols]

# rename for clarity
df_in = df_in.rename(columns={'County':'out_co_name','CountyFIP':'out_stco',\
                              'Subregion5':'out_subreg_5','Subregion7':'out_subreg_7',\
                              'Region':'out_reg','PERWT':'in_pop','famwt':'in_famwt'})
    
df_in.head()

Unnamed: 0,YEAR,in_stco,out_stco,out_subreg_7,out_subreg_5,out_reg,in_pop,in_famwt,REPWTP1,REPWTP2,...,REPWTP71,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80
0,2016,36085,36047,NYC,NYC,NYC,151.0,755.0,258,266,...,35,128,163,154,165,45,140,44,131,158
1,2016,36085,36047,NYC,NYC,NYC,131.0,655.0,224,242,...,35,127,130,126,166,47,114,39,130,144
2,2016,36085,36047,NYC,NYC,NYC,155.0,775.0,255,294,...,39,135,168,171,152,48,119,45,186,176
3,2016,36085,36047,NYC,NYC,NYC,107.0,535.0,200,195,...,25,96,120,99,116,38,98,31,92,130
4,2016,36085,36047,NYC,NYC,NYC,83.0,415.0,130,152,...,22,87,80,79,93,30,73,23,73,88


#### NYC Boroughs by Subregion of Origin (simplified) table

Select different summary columns in groupby function to choose counties or aggregation by borough of current residence.

In [17]:
#CHANGE TO PIVOT TABLE AND THEN RESET INDEX

dff_in = df_in.groupby(['YEAR','in_stco','out_reg']).sum().reset_index()

In [19]:
#calc average family size
dff_in['in_avgfam'] = dff_in.apply(lambda x:(x['in_famwt']/x['in_pop']),axis=1)
dff_in.head()

Unnamed: 0,YEAR,in_stco,out_reg,in_pop,in_famwt,REPWTP1,REPWTP2,REPWTP3,REPWTP4,REPWTP5,...,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80,in_avgfam
0,2016,36005,NYC,71489.0,250301.0,74804,74035,73106,74862,72372,...,71964,67816,67822,74504,68439,68721,67871,74393,72032,3.501252
1,2016,36005,Region,1476.0,4346.0,1260,1218,1611,1489,1204,...,1765,1877,1541,1480,1586,1740,1413,1341,1537,2.944444
2,2016,36005,US,6778.0,23102.0,7825,7326,8454,6213,6114,...,6316,5576,7431,6209,5587,7818,6189,6252,6574,3.40838
3,2016,36005,intl,16594.0,71800.0,17817,18224,13961,17371,18683,...,19732,16395,19231,18836,17413,18588,17209,16480,15652,4.326865
4,2016,36047,NYC,133082.0,502996.0,128923,138217,133265,126744,139539,...,135644,138632,142352,131984,131196,127887,126542,129154,134709,3.779595


In [20]:
# calculate standard error, margin of error, cv
# drop replicate weight columns
dff_in['in_se'] = dff_in.apply(lambda x: (get_se(x['in_pop'],x[repwts])),axis=1)
dff_in['in_moe'] = dff_in.apply(lambda x: (get_moe(x['in_se'])),axis=1)
dff_in['in_cv'] = dff_in.apply(lambda x: (get_cv(x['in_pop'],x['in_se'])),axis=1)

dff_in = dff_in.drop(columns=repwts) 
                            
dff_in

Unnamed: 0,YEAR,in_stco,out_reg,in_pop,in_famwt,in_avgfam,in_se,in_moe,in_cv
0,2016,36005,NYC,71489.0,250301.0,3.501252,7305.959581,12018.303511,6.212582
1,2016,36005,Region,1476.0,4346.0,2.944444,486.799753,800.785594,20.049248
2,2016,36005,US,6778.0,23102.0,3.408380,1445.757777,2378.271543,12.966658
3,2016,36005,intl,16594.0,71800.0,4.326865,2983.417797,4907.722276,10.929419
4,2016,36047,NYC,133082.0,502996.0,3.779595,7620.674294,12536.009214,3.481033
...,...,...,...,...,...,...,...,...,...
75,2019,36081,intl,17648.0,76454.0,4.332162,2164.673070,3560.887200,7.456430
76,2019,36085,NYC,19129.0,79355.0,4.148413,4123.598744,6783.319934,13.104433
77,2019,36085,Region,1027.0,4537.0,4.417722,548.654126,902.536037,32.475983
78,2019,36085,US,456.0,1387.0,3.041667,200.844094,330.388534,26.774929


### NYC resident out-migration to NYC Metro and U.S.

##### update csv paths & keep aggregation geography consistent with previous for net flow table in following section

In [36]:
# pull in outflow table
dff = pd.read_csv('../data/mig_nys_out_16191YR_fam.csv')

In [37]:
nyc_mig = {'03603700':'36005','03603800':'36061','03603900':'36085','03604000':'36047','03604100':'36081'}

In [38]:
# create new id columns for merge with geo lookup table
dff['in_respuma'] = dff['STATEFIP'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                    dff['PUMA'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

dff['out_migpuma'] = dff['MIGPLAC1'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                    dff['MIGPUMA1'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

In [39]:
# Select records only for people who lived in NYC one year ago (NYC out-migrants)
dff = dff[dff['out_migpuma'].isin(nyc_mig.keys())]
dff['out_stco'] = dff.out_migpuma.map(nyc_mig)

#Select only movers with a family size of >1
dff = dff[dff['FAMSIZE']>1]

In [40]:
## create a column that estimates pop per HH
dff['famwt'] = dff.apply(lambda x:(x['PERWT']*x['FAMSIZE']),axis=1)

In [41]:
# merge with geography look up tables 
df_out = dff.merge(df_respuma,how='left',left_on='in_respuma',right_on='puma_id')

In [42]:
# replace NaN values in aggregation columns with US for non region
column_clean = ['NAME','Subregion','Subregion2','Region','County','CountyFIPS']
for i in column_clean:
    df_out.loc[df_out[i].isnull(),i] = 'US'

In [43]:
df_out.head()

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,HHTYPE,CLUSTER,STATEFIP,PUMA,STRATA,...,out_stco,famwt,GEOID10,NAME,Subregion,Subregion2,Region,County,CountyFIPS,puma_id
0,2016,201601,25826,50701,142.0,3,2016000258261,4,600,60004,...,36047,284.0,,US,US,US,US,US,US,
1,2016,201601,25826,50701,142.0,3,2016000258261,4,600,60004,...,36047,340.0,,US,US,US,US,US,US,
2,2016,201601,40956,836433,118.0,1,2016000409561,4,126,12604,...,36061,228.0,,US,US,US,US,US,US,
3,2016,201601,57046,323315,162.0,9,2016000570461,5,1600,160005,...,36047,324.0,,US,US,US,US,US,US,
4,2016,201601,65681,1291896,76.0,1,2016000656811,5,100,10005,...,36061,152.0,,US,US,US,US,US,US,


In [44]:
# reduce table to just current counties and subregions of residence 
cols = ['YEAR','County','CountyFIPS','Subregion','Subregion2','Region','out_stco','PERWT','famwt'] + repwts
df_out = df_out[cols]

# rename for clarity
df_out = df_out.rename(columns={'County':'in_co_name','CountyFIPS':'in_stco',\
                              'Subregion':'in_subreg_7','Subregion2':'in_subreg_5',\
                              'Region':'in_reg','PERWT':'out_pop','famwt':'out_famwt'})
    
df_out.head()

Unnamed: 0,YEAR,in_co_name,in_stco,in_subreg_7,in_subreg_5,in_reg,out_stco,out_pop,out_famwt,REPWTP1,...,REPWTP71,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80
0,2016,US,US,US,US,US,36047,142.0,284.0,158,...,261,268,140,41,189,262,135,151,249,258
1,2016,US,US,US,US,US,36047,170.0,340.0,227,...,276,271,146,52,191,292,173,139,273,259
2,2016,US,US,US,US,US,36061,76.0,228.0,71,...,106,73,58,141,77,87,139,77,83,85
3,2016,US,US,US,US,US,36047,162.0,324.0,250,...,146,217,133,245,158,280,158,146,314,38
4,2016,US,US,US,US,US,36061,76.0,152.0,117,...,78,150,21,74,67,65,26,80,139,103


#### NYC total outflow by Subregion Destination (simplified) table

Select different summary columns in groupby function to choose counties or aggregation by borough of previous residence.

In [45]:
dff_out = df_out.groupby(['YEAR','out_stco','in_reg']).sum().reset_index()
#dff_out

In [46]:
# calc average family size
dff_out['out_avgfam'] = dff_out.apply(lambda x:(x['out_famwt']/x['out_pop']),axis=1)
dff_out.head()

Unnamed: 0,YEAR,out_stco,in_reg,out_pop,out_famwt,REPWTP1,REPWTP2,REPWTP3,REPWTP4,REPWTP5,...,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80,out_avgfam
0,2016,36005,NYC,65659.0,236049.0,67983,67328,66623,69288,67421,...,66209,61606,62252,68417,64407,61403,62685,68496,68455,3.595075
1,2016,36005,Region,12277.0,51208.0,10779,11502,12024,12753,14973,...,12606,14141,12856,12386,12737,12025,14293,11448,11598,4.171052
2,2016,36005,US,17560.0,68266.0,15818,15751,18486,18770,16582,...,17615,18325,16330,16237,18571,18590,14775,18696,19680,3.887585
3,2016,36047,NYC,137285.0,529479.0,133763,143222,139175,128555,144388,...,140589,144755,144732,136803,132949,133910,129733,132110,142851,3.856787
4,2016,36047,Region,17271.0,71062.0,16924,14307,18623,18468,18688,...,16137,19170,18849,16797,17335,17036,16807,17167,14505,4.114527


In [47]:
# calculate standard error, margin of error, cv
# drop replicate weight columns
dff_out['out_se'] = dff_out.apply(lambda x: (get_se(x['out_pop'],x[repwts])),axis=1)
dff_out['out_moe'] = dff_out.apply(lambda x: (get_moe(x['out_se'])),axis=1)
dff_out['out_cv'] = dff_out.apply(lambda x: (get_cv(x['out_pop'],x['out_se'])),axis=1)

dff_out = dff_out.drop(columns=repwts) 
                            
dff_out.head()

Unnamed: 0,YEAR,out_stco,in_reg,out_pop,out_famwt,out_avgfam,out_se,out_moe,out_cv
0,2016,36005,NYC,65659.0,236049.0,3.595075,6811.75771,11205.341433,6.306654
1,2016,36005,Region,12277.0,51208.0,4.171052,2397.955431,3944.636684,11.873615
2,2016,36005,US,17560.0,68266.0,3.887585,2733.395526,4496.43564,9.462634
3,2016,36047,NYC,137285.0,529479.0,3.856787,7832.446811,12884.375004,3.468235
4,2016,36047,Region,17271.0,71062.0,4.114527,2872.498921,4725.260725,10.11059


## Export in and out table to Excel

In [22]:
dff_in.to_excel('../output/nycmig_in_reg_boroughs_2016-2019_fam.xlsx')

In [48]:
dff_out.to_excel('../output/nycmig_out_reg_boroughs_2016-2019_fam.xlsx')