### Using IPUMS records, calculate in-migration to and out-migration from NYC, for the NYC Metro region and the domestic U.S.

Requires CSV extracts downloaded from the IPUMS USA website:
https://usa.ipums.org

For more information about the specific variables available for download, refer to the IPUMS USA website, e.g.:
https://usa.ipums.org/usa-action/variables/MIGRATE1#codes_section

In [1]:
import pandas as pd
import numpy as np
import math

#### Stat functions for using replicate weights

In [2]:
# functions to calculate standard error, moe, and coefficient of variation
def get_se(per_wt,rep_weights):
    """Return the standard error of an estimate from its replicate-weight estimates.

    per_wt      -- the full-sample estimate (sum of the person/household weight).
    rep_weights -- iterable of the same estimate recomputed with each replicate weight.

    The squared deviations of every replicate estimate from ``per_wt`` are summed,
    divided by 20, and square-rooted.
    NOTE(review): the divisor 20 presumably encodes the 4/80 factor used with
    80 replicate weights -- confirm before using a different number of replicates.
    """
    squared_devs = [(rep - per_wt) ** 2 for rep in rep_weights]
    return math.sqrt(sum(squared_devs) / 20)

def get_moe(se):
    """Convert a standard error into a margin of error at the 90% confidence level."""
    z_90 = 1.645  # multiplier for a 90% confidence interval
    return se * z_90

def agg_moe(m):
    """Combine several margins of error into one: the square root of the sum of squares.

    m -- iterable of margins of error belonging to the estimates being added or subtracted.
    """
    total_sq = 0
    for moe in m:
        total_sq += moe ** 2
    return math.sqrt(total_sq)

def get_cv(est,m):
    """Return the coefficient of variation, in percent, of an estimate.

    est -- the estimate.
    m   -- the estimate's margin of error; it is divided by 1.645 (the 90%
           multiplier used by get_moe) to recover the standard error.

    A zero estimate yields 0 rather than a division by zero.
    """
    if est != 0:
        return np.absolute(m / 1.645 / est) * 100
    return 0

In [3]:
# Names of the 80 person-level replicate weight columns: REPWTP1 ... REPWTP80.
repwt = 'REPWTP'
repwts = ['{}{}'.format(repwt, n) for n in range(1, 81)]

#### Geography look up files & cleanup

In [4]:
# Read the three input tables: the main records file (df) and the two
# residence-PUMA crosswalks (presumably for 2000 and 2010 PUMA vintages).
df, respuma_00, respuma_10 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/usa_00046.csv',
        'data/respuma_xwalk_00.csv',
        'data/respuma_xwalk_10.csv',
    )
)

In [5]:
# Preview the first rows of the loaded records table.
df.head()

Unnamed: 0,YEAR,MULTYEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,COUNTYFIP,PUMA,STRATA,GQ,HHINCOME,PERNUM,PERWT
0,2000,,200001,436195,,13.0,2000004361951,34,1,101,56,1,55600,1,11.0
1,2000,,200001,436195,,13.0,2000004361951,34,1,101,56,1,55600,2,11.0
2,2000,,200001,436195,,13.0,2000004361951,34,1,101,56,1,55600,3,8.0
3,2000,,200001,436195,,13.0,2000004361951,34,1,101,56,1,55600,4,9.0
4,2000,,200001,436195,,13.0,2000004361951,34,1,101,56,1,55600,5,8.0


In [6]:
# Cast the crosswalk join key to text so it can be matched against the string id built from df.
respuma_00['GISMATCH'] = respuma_00['GISMATCH'].map(str)

In [7]:
# Build the state+PUMA key used to merge with the geography lookup tables:
# STATEFIP as text (no padding) followed by PUMA left-padded with zeros to 5 characters.
df['stpuma_id'] = (
    df['STATEFIP'].map(str)
    + df['PUMA'].map(str).str.rjust(5, '0')
)

In [8]:
# Inspect column dtypes; stpuma_id should be an object (string) column.
df.dtypes

YEAR           int64
MULTYEAR     float64
SAMPLE         int64
SERIAL         int64
CBSERIAL     float64
HHWT         float64
CLUSTER        int64
STATEFIP       int64
COUNTYFIP      int64
PUMA           int64
STRATA         int64
GQ             int64
HHINCOME       int64
PERNUM         int64
PERWT        float64
stpuma_id     object
dtype: object

In [9]:
# Keep only the first person record of each household (PERNUM == 1) so each
# household appears once, then split the records by sample year.
df = df[df['PERNUM'] == 1]
df_00, df_10, df_18 = (df[df['YEAR'] == yr] for yr in (2000, 2010, 2018))

In [10]:
# Preview the table after reducing it to PERNUM == 1 records.
df.head()

Unnamed: 0,YEAR,MULTYEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,COUNTYFIP,PUMA,STRATA,GQ,HHINCOME,PERNUM,PERWT,stpuma_id
0,2000,,200001,436195,,13.0,2000004361951,34,1,101,56,1,55600,1,11.0,3400101
5,2000,,200001,436196,,22.0,2000004361961,34,1,101,105,1,6750,1,17.0,3400101
6,2000,,200001,436198,,19.0,2000004361981,34,1,101,80,1,6500,1,14.0,3400101
7,2000,,200001,436200,,52.0,2000004362001,34,1,101,62,1,63500,1,52.0,3400101
12,2000,,200001,436202,,14.0,2000004362021,34,1,101,93,1,140700,1,22.0,3400101


In [11]:
# Attach the 2000 crosswalk attributes to the 2000 records; unmatched records keep NaN geography.
reg_00 = pd.merge(
    df_00,
    respuma_00,
    how='left',
    left_on='stpuma_id',
    right_on='GISMATCH',
)


In [12]:
# Scale household income by 1.45.
# NOTE(review): presumably an inflation adjustment of 2000 dollars to 2018 dollars -- confirm the factor.
reg_00['HHI_18'] = reg_00['HHINCOME'] * 1.45

In [13]:
def inc_sort(x):
    """Bucket an income value into '<$50k', '$50k-$100k' or '>$100k'.

    Anything that falls in none of those ranges (negative values, NaN) is labelled 'NA'.
    """
    if 0 <= x < 50000:
        return '<$50k'
    if 50000 <= x < 100000:
        return '$50k-$100k'
    if x >= 100000:
        return '>$100k'
    return 'NA'

# Categorise the scaled income, then drop records whose raw HHINCOME is the 9999999 code.
reg_00['HHI_18_cat'] = reg_00['HHI_18'].map(inc_sort)
reg_00 = reg_00[reg_00['HHINCOME'].ne(9999999)]

In [14]:
# Preview the merged 2000 table with the new income columns.
reg_00.head()

Unnamed: 0,YEAR,MULTYEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP_x,COUNTYFIP,PUMA_x,...,StateName,STATEFIP_y,PUMA_y,GISJOIN,GISMATCH,CountyCode,CountyName,Subregion,HHI_18,HHI_18_cat
0,2000,,200001,436195,,13.0,2000004361951,34,1,101,...,,,,,,,,,80620.0,$50k-$100k
1,2000,,200001,436196,,22.0,2000004361961,34,1,101,...,,,,,,,,,9787.5,<$50k
2,2000,,200001,436198,,19.0,2000004361981,34,1,101,...,,,,,,,,,9425.0,<$50k
3,2000,,200001,436200,,52.0,2000004362001,34,1,101,...,,,,,,,,,92075.0,$50k-$100k
4,2000,,200001,436202,,14.0,2000004362021,34,1,101,...,,,,,,,,,204015.0,>$100k


In [15]:
# Sum household weights by county (rows) and income category (columns).
test = reg_00.pivot_table(
    values='HHWT',
    index='CountyCode',
    columns=['HHI_18_cat'],
    aggfunc=np.sum,
)

In [16]:
# Write the county-by-income pivot table to an Excel file in the working directory.
test.to_excel('test.xlsx')

In [None]:
# Reduce the table to people who moved in the last year and currently live in an NYC county.
# NOTE(review): the df.dtypes listing earlier in this notebook shows no MIGRATE1 or
# in_stco column -- confirm which extract this cell is meant to run against.

# MIGRATE1 values 2, 3 and 4 mark people who moved in the last year; other codes are non-movers.
mig_codes = [2,3,4]

# State+county FIPS codes for the NYC counties (also used by the out-migration cells).
nyc = ['36005','36047','36061','36081','36085']

# Apply both conditions at once: moved in the last year AND lives in NYC now.
df = df[df['MIGRATE1'].isin(mig_codes) & df['in_stco'].isin(nyc)]

In [None]:
# Attach the place-of-origin lookups: first by MIGPLAC1, then by migration PUMA.
# NOTE(review): df_migpl and df_migpuma are not created anywhere in the visible notebook.
df_in = (
    df.merge(df_migpl, how='left', left_on='MIGPLAC1', right_on='migplac_id')
      .merge(df_migpuma, how='left', left_on='migpuma_id', right_on='MIGPUMA_str')
)

In [None]:
# Where a record has no regional geography, fall back to its US_intl value
# so every row carries a label in each aggregation column.
column_clean = ['Subregion5','Subregion7','County','CountyFIP']
for col in column_clean:
    df_in.loc[df_in[col].isna(), col] = df_in['US_intl']

In [None]:
# Keep only the current county, the geography of residence one year ago, and the weights,
# renaming the origin columns with an "out_" prefix and PERWT to in_pop for clarity.
cols = ['in_stco','County','CountyFIP','Subregion7','Subregion5','PERWT'] + repwts
df_in = df_in[cols].rename(columns={
    'County': 'out_co_name',
    'CountyFIP': 'out_stco',
    'Subregion5': 'out_subreg_5',
    'Subregion7': 'out_subreg_7',
    'PERWT': 'in_pop',
})

df_in.head()

#### NYC total by Subregion of Origin (simplified) table

Select different summary columns in groupby function to choose counties or aggregation by borough of current residence.

In [None]:
# Total in-migrants, and each replicate-weight total, by subregion of origin.
# Only the weight columns are summed: the remaining columns are string ids/names,
# which a bare .sum() would concatenate (or reject) on recent pandas versions.
dff_in = df_in.groupby('out_subreg_5')[['in_pop'] + repwts].sum().reset_index()

In [None]:
# Calculate standard error, margin of error and coefficient of variation for each
# subregion total, then drop the replicate weight columns.
dff_in['in_se'] = dff_in.apply(lambda row: get_se(row['in_pop'], row[repwts]), axis=1)
dff_in['in_moe'] = dff_in['in_se'].apply(get_moe)
# get_cv divides its second argument by 1.645, i.e. it expects the margin of error;
# passing the standard error here would understate the CV by that factor.
dff_in['in_cv'] = dff_in.apply(lambda row: get_cv(row['in_pop'], row['in_moe']), axis=1)

dff_in = dff_in.drop(columns=repwts)

dff_in

### NYC resident out-migration to NYC Metro and U.S.

##### Update the CSV paths, and keep the aggregation geography consistent with the in-migration section above so the net flow table in the following section lines up

In [None]:
# Read the outflow records table; this rebinds dff for the out-migration steps below.
dff = pd.read_csv('data/nyc_outflow_110620.csv')

In [None]:
# Build the string keys used below.
# in_respuma: STATEFIP zero-padded to 3 characters + PUMA zero-padded to 5 characters.
# NOTE(review): the stpuma_id key built earlier in this notebook leaves STATEFIP
# unpadded -- confirm the 3-character padding matches puma_id in the lookup table.
dff['in_respuma'] = (
    dff['STATEFIP'].map(str).str.rjust(3, '0')
    + dff['PUMA'].map(str).str.rjust(5, '0')
)

# out_stco: MIGPLAC1 padded to 2 characters + MIGCOUNTY1 padded to 3 characters.
dff['out_stco'] = (
    dff['MIGPLAC1'].map(str).str.rjust(2, '0')
    + dff['MIGCOUNTY1'].map(str).str.rjust(3, '0')
)

In [None]:
# Select records only for people who lived in NYC one year ago (NYC out-migrants).
# `nyc` is the list of NYC county codes defined in the in-migration filter cell above.
dff = dff[dff['out_stco'].isin(nyc)]

In [None]:
# Attach the geography of current residence to each out-migrant record.
# NOTE(review): df_respuma is not created anywhere in the visible notebook.
df_out = pd.merge(
    dff,
    df_respuma,
    how='left',
    left_on='in_respuma',
    right_on='puma_id',
)

In [None]:
# Records that matched no row of the lookup table are labelled 'US' in every aggregation column.
column_clean = ['NAME','Subregion','Subregion2','County','CountyFIPS']
for col in column_clean:
    df_out.loc[df_out[col].isna(), col] = 'US'

In [None]:
# Keep only the geography of current residence, the county of origin, and the weights,
# renaming the destination columns with an "in_" prefix and PERWT to out_pop for clarity.
cols = ['County','CountyFIPS','Subregion','Subregion2','out_stco','PERWT'] + repwts
df_out = df_out[cols].rename(columns={
    'County': 'in_co_name',
    'CountyFIPS': 'in_stco',
    'Subregion': 'in_subreg_7',
    'Subregion2': 'in_subreg_5',
    'PERWT': 'out_pop',
})

df_out.head()

#### NYC total outflow by Subregion Destination (simplified) table

Select different summary columns in groupby function to choose counties or aggregation by borough of previous residence.

In [None]:
# Total out-migrants, and each replicate-weight total, by subregion of destination.
# Only the weight columns are summed: the remaining columns are string ids/names,
# which a bare .sum() would concatenate (or reject) on recent pandas versions.
dff_out = df_out.groupby('in_subreg_5')[['out_pop'] + repwts].sum().reset_index()

In [None]:
# Calculate standard error, margin of error and coefficient of variation for each
# subregion total, then drop the replicate weight columns.
dff_out['out_se'] = dff_out.apply(lambda row: get_se(row['out_pop'], row[repwts]), axis=1)
dff_out['out_moe'] = dff_out['out_se'].apply(get_moe)
# get_cv divides its second argument by 1.645, i.e. it expects the margin of error;
# passing the standard error here would understate the CV by that factor.
dff_out['out_cv'] = dff_out.apply(lambda row: get_cv(row['out_pop'], row['out_moe']), axis=1)

dff_out = dff_out.drop(columns=repwts)

dff_out

### Merge in & out to create net flow columns

In [None]:
# Line up in-flow and out-flow totals by subregion and treat missing values as 0.
# NOTE(review): this is a left join on the in-flow table, so a subregion that appears
# only in the out-flow table is dropped -- confirm that is intended.
df_net = (
    dff_in
    .merge(dff_out, how='left', left_on='out_subreg_5', right_on='in_subreg_5')
    .replace(np.nan, 0)
)

In [None]:
# Net flow = in-migrants minus out-migrants for each subregion.
df_net['net_pop'] = df_net['in_pop'] - df_net['out_pop']
# The margin of error of a difference is the root sum of squares of the two MOEs.
df_net['net_moe'] = df_net.apply(lambda row: agg_moe(row[['in_moe','out_moe']]), axis=1)
# get_cv takes (estimate, margin of error); the previously referenced get_cv_2
# is not defined in this notebook and raised a NameError.
df_net['net_cv'] = df_net.apply(lambda row: get_cv(row['net_pop'], row['net_moe']), axis=1)

In [None]:
# Display the combined in-flow / out-flow / net-flow table.
df_net

In [None]:
# Write the net flow table to Excel.
# NOTE(review): presumably the 'output' directory must already exist -- confirm before running.
df_net.to_excel('output/nyc_subregion_migration_1418.xlsx')