### Using IPUMS records, calculate in-migration and out-migration to/from NYC - NYC Metro and domestic U.S.

Requires download of csv extracts from IPUMS USA website
https://usa.ipums.org

For more information about specific variables available for download, refer to IPUMS-USA website, ex.:
https://usa.ipums.org/usa-action/variables/MIGRATE1#codes_section

This notebook updates the previous to pull 4 1-year Estimates for 2016 to 2019

In [28]:
import pandas as pd
import numpy as np
import math

#### Stat functions for using replicate weights

In [29]:
# functions to calculate standard error, moe, and coefficient of variation
def get_se(per_wt,rep_weights):
    result = math.sqrt((sum(map(lambda x: (x-per_wt)**2,rep_weights))/20))
    return result

def get_moe(se):
    return se*1.645 #90% confidence interval

def agg_moe(m):
    result = math.sqrt(sum(map(lambda x: x**2, m)))
    return result

def get_cv(est,m):
    if est == 0:
        return 0
    else:
        return (np.absolute(m/1.645/est))*100

In [30]:
# create a list of replicate weights
repwt = 'REPWTP'
repwts = [repwt+str(i) for i in range(1, 81)]

#### Geography look up files & cleanup

In [31]:
# pull in geography reference files
df_migpuma = pd.read_csv('data/migpwpuma_xwalk_10.csv')
df_respuma = pd.read_csv('data/respuma_xwalk_10.csv')
df_migpl = pd.read_csv('data/migpl_xwalk.csv')

#df_migpuma.head()

In [32]:
# clean up migpuma table for merge with data table 
df_migpuma['MIGPUMA_str'] = df_migpuma['MIGPUMAID'].apply(str).apply(lambda x: '{0:0>8}'.format(x))
df_migpuma['CountyFIP'] = df_migpuma['CountyFIP'].apply(str)
df_migpuma = df_migpuma.drop(columns=['STATEFIP','State','MIGSTATE','MIGPUMA','PWSTATE','PWPUMA','StringGIS'])
#df_migpuma.head()

In [33]:
df_respuma['puma_id'] = df_respuma['GEOID10'].apply(str).apply(lambda x: '{0:0>8}'.format(x))
df_respuma['CountyFIPS'] = df_respuma['CountyFIPS'].apply(str).apply(lambda x: '{0:0>5}'.format(x))
df_respuma = df_respuma.drop(columns=['StateFIPS','PUMA'])

In [34]:
#forgot to pull COUNTYFIPS in pums extract for in-migration
#these are to clean the in-migration table for borough totals
nyc_county = dict(df_respuma[['puma_id','CountyFIPS']].values)
nyc_respuma = df_respuma[df_respuma['Subregion']=='NYC']['puma_id'].tolist()

### In-migrants to NYC boroughs by NYC Metro Subregion & County
Following options available:
- By NYC total or Borough of Current Residence (i.e. destination)
- BY NYC Metro Subregion of residence 1 year ago (i.e. origin)

In [35]:
# read in full inflow table (raw PUMS data)
df = pd.read_csv('data/mig_nys_in_16191YR.csv')
#df.head()

In [36]:
# create a new id columns for merge with geo lookup tables
df['migpuma_id'] = df['MIGPLAC1'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                    df['MIGPUMA1'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

df['in_respuma'] = df['STATEFIP'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                   df['PUMA'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

In [37]:
# REDUCE DATA TABLE TO PEOPLE WHO HAVE MOVED TO NYC COUNTIES ONLY IN THE LAST YEAR
# Select only records for people who have moved in the last year
# MIGRATE1 == 2,3,4 ; other codes are people who haven't moved
mig_codes = [2,3,4]
df = df[df['MIGRATE1'].isin(mig_codes)]

# Select records for people who live in NYC currently
df = df[df['in_respuma'].isin(nyc_respuma)]
df['in_stco'] = df.in_respuma.map(nyc_county)
df.head()

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,PUMA,STRATA,GQ,...,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80,migpuma_id,in_respuma,in_stco
21,2016,201601,810947,210,38.0,2016008109471,36,3808,380836,1,...,66,38,43,41,74,38,10,15000001,3603808,36061
26,2016,201601,810951,261,150.0,2016008109511,36,3902,390236,1,...,154,165,45,140,44,131,158,3604000,3603902,36085
27,2016,201601,810951,261,150.0,2016008109511,36,3902,390236,1,...,126,166,47,114,39,130,144,3604000,3603902,36085
28,2016,201601,810951,261,150.0,2016008109511,36,3902,390236,1,...,171,152,48,119,45,186,176,3604000,3603902,36085
29,2016,201601,810951,261,150.0,2016008109511,36,3902,390236,1,...,99,116,38,98,31,92,130,3604000,3603902,36085


In [38]:
# merge with geography look up tables 
df_in = df.merge(df_migpl,how='left',left_on='MIGPLAC1',right_on='migplac_id').merge(df_migpuma,how='left',left_on='migpuma_id',right_on='MIGPUMA_str')
#df_in.head()

In [39]:
# replace NaN values in aggregation columns with domestic/intl values
column_clean = ['Subregion5','Subregion7','Region','County','CountyFIP']
for i in column_clean:
    df_in.loc[df_in[i].isnull(),i] = df_in['US_intl']

In [40]:
# reduce table to just counties and subregions of residence 1 year ago
cols = ['YEAR','in_stco','CountyFIP','Subregion7','Subregion5','Region','PERWT'] + repwts
df_in = df_in[cols]

# rename for clarity
df_in = df_in.rename(columns={'County':'out_co_name','CountyFIP':'out_stco',\
                              'Subregion5':'out_subreg_5','Subregion7':'out_subreg_7',\
                              'Region':'out_reg','PERWT':'in_pop'})
    
df_in.head()

Unnamed: 0,YEAR,in_stco,out_stco,out_subreg_7,out_subreg_5,out_reg,in_pop,REPWTP1,REPWTP2,REPWTP3,...,REPWTP71,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80
0,2016,36061,intl,intl,intl,intl,37.0,30,10,37,...,13,34,8,66,38,43,41,74,38,10
1,2016,36085,36047,NYC,NYC,NYC,151.0,258,266,243,...,35,128,163,154,165,45,140,44,131,158
2,2016,36085,36047,NYC,NYC,NYC,131.0,224,242,208,...,35,127,130,126,166,47,114,39,130,144
3,2016,36085,36047,NYC,NYC,NYC,155.0,255,294,257,...,39,135,168,171,152,48,119,45,186,176
4,2016,36085,36047,NYC,NYC,NYC,107.0,200,195,164,...,25,96,120,99,116,38,98,31,92,130


#### NYC Boroughs by Subregion of Origin (simplified) table

Select different summary columns in groupby function to choose counties or aggregation by borough of current residence.

In [41]:
#CHANGE TO PIVOT TABLE AND THEN RESET INDEX

dff_in = df_in.groupby(['YEAR','in_stco','out_reg']).sum().reset_index()

In [42]:
# calculate standard error, margin of error, cv
# drop replicate weight columns
dff_in['in_se'] = dff_in.apply(lambda x: (get_se(x['in_pop'],x[repwts])),axis=1)
dff_in['in_moe'] = dff_in.apply(lambda x: (get_moe(x['in_se'])),axis=1)
dff_in['in_cv'] = dff_in.apply(lambda x: (get_cv(x['in_pop'],x['in_se'])),axis=1)

dff_in = dff_in.drop(columns=repwts) 
                            
dff_in

Unnamed: 0,YEAR,in_stco,out_reg,in_pop,in_se,in_moe,in_cv
0,2016,36005,NYC,105669.0,7723.432132,12705.045857,4.443210
1,2016,36005,Region,3990.0,822.586743,1353.155192,12.532650
2,2016,36005,US,9204.0,1513.278527,2489.343177,9.994852
3,2016,36005,intl,19071.0,3090.397450,5083.703806,9.850879
4,2016,36047,NYC,192499.0,8521.382452,14017.674134,2.691012
...,...,...,...,...,...,...,...
75,2019,36081,intl,20273.0,2290.606012,3768.046889,6.868572
76,2019,36085,NYC,23637.0,4196.868922,6903.849377,10.793621
77,2019,36085,Region,1284.0,566.293961,931.553565,26.810876
78,2019,36085,US,897.0,315.228964,518.551646,21.363272


### NYC resident out-migration to NYC Metro and U.S.

##### update csv paths & keep aggregation geography consistent with previous for net flow table in following section

In [43]:
# pull in outflow table
dff = pd.read_csv('data/mig_nys_out_16191YR.csv')

In [44]:
nyc_mig = {'03603700':'36005','03603800':'36061','03603900':'36085','03604000':'36047','03604100':'36081'}

In [45]:
# create new id columns for merge with geo lookup table
dff['in_respuma'] = dff['STATEFIP'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                    dff['PUMA'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

dff['out_migpuma'] = dff['MIGPLAC1'].apply(str).apply(lambda x: '{0:0>3}'.format(x)) + \
                    dff['MIGPUMA1'].apply(str).apply(lambda x: '{0:0>5}'.format(x))

In [46]:
# Select records only for people who lived in NYC one year ago (NYC out-migrants)
dff = dff[dff['out_migpuma'].isin(nyc_mig.keys())]
dff['out_stco'] = dff.out_migpuma.map(nyc_mig)

In [47]:
# merge with geography look up tables 
df_out = dff.merge(df_respuma,how='left',left_on='in_respuma',right_on='puma_id')

In [48]:
# replace NaN values in aggregation columns with US for non region
column_clean = ['NAME','Subregion','Subregion2','Region','County','CountyFIPS']
for i in column_clean:
    df_out.loc[df_out[i].isnull(),i] = 'US'

In [49]:
df_out.head()

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,PUMA,STRATA,GQ,...,out_migpuma,out_stco,GEOID10,NAME,Subregion,Subregion2,Region,County,CountyFIPS,puma_id
0,2016,201601,10665,737324,187.0,2016000106651,1,2500,250001,1,...,3604100,36081,,US,US,US,US,US,US,
1,2016,201601,11058,762866,94.0,2016000110581,1,2702,270201,1,...,3604000,36047,,US,US,US,US,US,US,
2,2016,201601,15911,1092556,85.0,2016000159111,1,1700,170001,1,...,3604100,36081,,US,US,US,US,US,US,
3,2016,201601,25826,50701,142.0,2016000258261,4,600,60004,1,...,3604000,36047,,US,US,US,US,US,US,
4,2016,201601,25826,50701,142.0,2016000258261,4,600,60004,1,...,3604000,36047,,US,US,US,US,US,US,


In [50]:
# reduce table to just current counties and subregions of residence 
cols = ['YEAR','County','CountyFIPS','Subregion','Subregion2','Region','out_stco','PERWT'] + repwts
df_out = df_out[cols]

# rename for clarity
df_out = df_out.rename(columns={'County':'in_co_name','CountyFIPS':'in_stco',\
                              'Subregion':'in_subreg_7','Subregion2':'in_subreg_5',\
                              'Region':'in_reg','PERWT':'out_pop'})
    
df_out.head()

Unnamed: 0,YEAR,in_co_name,in_stco,in_subreg_7,in_subreg_5,in_reg,out_stco,out_pop,REPWTP1,REPWTP2,...,REPWTP71,REPWTP72,REPWTP73,REPWTP74,REPWTP75,REPWTP76,REPWTP77,REPWTP78,REPWTP79,REPWTP80
0,2016,US,US,US,US,US,36081,455.0,470,527,...,529,488,411,417,456,423,200,241,253,536
1,2016,US,US,US,US,US,36047,94.0,92,88,...,144,119,31,21,116,119,36,157,179,121
2,2016,US,US,US,US,US,36081,85.0,143,66,...,143,148,102,86,75,117,109,36,32,24
3,2016,US,US,US,US,US,36047,142.0,158,227,...,261,268,140,41,189,262,135,151,249,258
4,2016,US,US,US,US,US,36047,170.0,227,255,...,276,271,146,52,191,292,173,139,273,259


#### NYC total outflow by Subregion Destination (simplified) table

Select different summary columns in groupby function to choose counties or aggregation by borough of previous residence.

In [51]:
dff_out = df_out.groupby(['YEAR','out_stco','in_reg']).sum().reset_index()
#dff_out

In [52]:
# calculate standard error, margin of error, cv
# drop replicate weight columns
dff_out['out_se'] = dff_out.apply(lambda x: (get_se(x['out_pop'],x[repwts])),axis=1)
dff_out['out_moe'] = dff_out.apply(lambda x: (get_moe(x['out_se'])),axis=1)
dff_out['out_cv'] = dff_out.apply(lambda x: (get_cv(x['out_pop'],x['out_se'])),axis=1)

dff_out = dff_out.drop(columns=repwts) 
                            
dff_out.head()

Unnamed: 0,YEAR,out_stco,in_reg,out_pop,out_se,out_moe,out_cv
0,2016,36005,NYC,95596.0,7458.159213,12268.671906,4.742704
1,2016,36005,Region,16199.0,2514.671559,4136.634715,9.436852
2,2016,36005,US,25017.0,2895.628507,4763.308894,7.036257
3,2016,36047,NYC,197885.0,8499.478613,13981.642319,2.61104
4,2016,36047,Region,23111.0,3026.133903,4977.990271,7.959825


## Export in and out table to Excel

In [53]:
dff_in.to_excel('output/nycmig_in_reg_boroughs_2016-2019.xlsx')

In [54]:
dff_out.to_excel('output/nycmig_out_reg_boroughs_2016-2019.xlsx')

## Pivot & resave

In [60]:
dfff_in = pd.pivot_table(dff_in,values=['in_pop','in_moe','in_cv'],index='in_stco',\
                         columns=['YEAR','out_reg'],aggfunc=np.sum)

In [61]:
dfff_in

Unnamed: 0_level_0,in_cv,in_cv,in_cv,in_cv,in_cv,in_cv,in_cv,in_cv,in_cv,in_cv,...,in_pop,in_pop,in_pop,in_pop,in_pop,in_pop,in_pop,in_pop,in_pop,in_pop
YEAR,2016,2016,2016,2016,2017,2017,2017,2017,2018,2018,...,2017,2017,2018,2018,2018,2018,2019,2019,2019,2019
out_reg,NYC,Region,US,intl,NYC,Region,US,intl,NYC,Region,...,US,intl,NYC,Region,US,intl,NYC,Region,US,intl
in_stco,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
36005,4.44321,12.53265,9.994852,9.850879,3.428217,12.403869,10.641277,8.039764,5.10289,11.458385,...,8699.0,14353.0,86209.0,8514.0,9437.0,10002.0,100058.0,4856.0,9882.0,11309.0
36047,2.691012,8.054855,5.061586,5.688594,2.382853,7.249046,5.911995,6.2172,2.485581,9.408744,...,28546.0,22760.0,195116.0,9722.0,23910.0,18287.0,199291.0,12452.0,28936.0,19029.0
36061,3.380807,6.440772,4.67619,6.402438,2.830003,5.759859,4.46476,6.570611,3.283381,7.161434,...,57189.0,31353.0,152242.0,19040.0,52849.0,24390.0,171673.0,22828.0,59542.0,32841.0
36081,3.291504,8.23626,7.252809,7.155501,3.426381,8.398947,7.912991,7.862156,2.964427,9.023847,...,17974.0,22517.0,140264.0,13300.0,15812.0,19768.0,141710.0,13507.0,17629.0,20273.0
36085,7.969522,34.337877,16.94674,24.327775,7.393676,30.086556,25.140895,22.472013,7.615406,25.969105,...,1328.0,1717.0,22967.0,717.0,3972.0,1511.0,23637.0,1284.0,897.0,2842.0
