# Predictor Processing

In [77]:
# Load Dependencies
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

True

In [31]:
# Load Custom Scripts
from src.utils.ipums_extract import (
    get_ipums_data,
    load_ipums_from_pkl,
)

## New Data Extract

In [78]:
# Define Parameters
# The API key is read from the IPUMS_API_KEY environment variable so that it
# never has to be typed into the notebook (do not push your API key to VCS).
# The "key" placeholder is kept as the fallback when the variable is unset.
API_KEY = os.environ.get("IPUMS_API_KEY", "key")
DOWNLOAD_DIR = Path(r"data")
PKL_EXPORT = True
PKL_PATH = Path(r"data/ipums_extract.pkl")

# Extract definition passed to get_ipums_data
collection = "ipumsi"
description = "data mining mozambique project"
samples = ["mz1997a", "mz2007a", "mz2017a"]

# Variables requested in the extract
variables = ['PERSONS', 'GQ', 'URBAN', 'GEO1_MZ', 'GEO2_MZ', 'OWNERSHIP',
             'PHONE', 'AUTOS', 'ROOMS', 'HHTYPE', 'RESIDENT', 'FAMSIZE',
             'NCHILD', 'AGE', 'SEX', 'MARST', 'MORTMOT', 'MORTFAT',
             'NATIVITY', 'CITIZEN', 'BPL1_MZ', 'SCHOOL', 'LIT',
             'EDATTAIN', 'EMPSTAT', 'LABFORCE', 'MIGRATE1', 'MIGRATE5']

In [13]:
# Get IPUMS Data
# Submits a new extract request and waits for the IPUMS server to finish
# processing it, so this cell can run for a long time.
extract_request = dict(
    collection=collection,
    description=description,
    samples=samples,
    variables=variables,
    api_key=API_KEY,
    download_dir=DOWNLOAD_DIR,
    pkl_export=PKL_EXPORT,
    pkl_path=PKL_PATH,
)
ipums_df = get_ipums_data(**extract_request)

Extract submitted to IPUMS. Extract ID: 6.
Waiting for extract to finish processing on IPUMS server...


KeyboardInterrupt: 

In [33]:
# Load Data from PKL
# Reads the extract from the pickle file at PKL_PATH.
# NOTE(review): presumably this is the file written by get_ipums_data when
# pkl_export is enabled — confirm. Only load pickle files you produced yourself.
ipums_df_pkl = load_ipums_from_pkl(PKL_PATH)

In [34]:
# Transform to fix NIU, unknown and other issues
# NOTE(review): exec() runs whatever var_dictionaries.txt contains — only use a
# trusted copy of that file.
var_dicts = ''
with open('var_dictionaries.txt', encoding='utf-8') as dict_file:
    dict_source = dict_file.read()
exec("var_dicts = " + dict_source)

# Recode every column that has a lookup table; values without an entry in the
# table are kept unchanged.
for column, lookup in var_dicts.items():
    if column not in ipums_df_pkl.columns:
        continue
    ipums_df_pkl[column] = ipums_df_pkl[column].map(lambda value: lookup.get(value, value))

In [36]:
# Remove rows whose migration status is missing (NIU / unknown), separately for
# the 1-year and 5-year questions
migration_cols = ['MIGRATE1', 'MIGRATE5']

# For each frame: copy the relevant migration column into the prediction target
# `mig_provincial`, then drop both raw migration columns
mig1_data = (
    ipums_df_pkl
    .dropna(subset=['MIGRATE1'])
    .assign(mig_provincial=lambda df: df['MIGRATE1'])
    .drop(columns=migration_cols)
)
mig5_data = (
    ipums_df_pkl
    .dropna(subset=['MIGRATE5'])
    .assign(mig_provincial=lambda df: df['MIGRATE5'])
    .drop(columns=migration_cols)
)

In [37]:
# Class balance of the target in each frame (NaN counted too, if any remain)
for frame in (mig1_data, mig5_data):
    print(frame['mig_provincial'].value_counts(dropna=False))

mig_provincial
0.0    5860462
1.0      69067
Name: count, dtype: int64
mig_provincial
0.0    4746396
1.0     228173
Name: count, dtype: int64


In [38]:
# Drop Metadata Columns (removed from both frames in place)
metadata_cols = ['COUNTRY', 'SAMPLE', 'SERIAL', 'HHWT', 'PERNUM', 'PERWT']
for frame in (mig1_data, mig5_data):
    frame.drop(columns=metadata_cols, inplace=True)

In [39]:
# Drop Detailed Columns (the *D variants plus the two GEO columns)
detailed_cols = [
    'OWNERSHIPD', 'MARSTD', 'EDATTAIND', 'EMPSTATD',
    'GEO1_MZ', 'GEO2_MZ',
]
for frame in (mig1_data, mig5_data):
    frame.drop(columns=detailed_cols, inplace=True)

## Predictor conversion

In [40]:
# YEAR: 3 indicator variables (0 or 1), one per census year [1997, 2007, 2017]
# var name: YEAR_<YEAR> (ex: YEAR_1997)
# NOTE(review): YEAR is compared against string labels; if the column holds
# integers every indicator will be 0 — confirm the dtype of YEAR.
census_years = ('1997', '2007', '2017')

for frame in (mig1_data, mig5_data):
    for year in census_years:
        frame[f'YEAR_{year}'] = np.where(frame['YEAR'] == year, 1, 0)
    # Drop Old Column
    frame.drop(columns=['YEAR'], inplace=True)

For binning, use `unique()` (i.e., `mig1_data['PERSONS'].unique()`) to identify
the unique values and always leave space (if necessary/reasonable) on top for
generality.

In [41]:
# PERSONS: one indicator per band of 10 (5 variables)
# var name: PERSONS_<cutoff> (ex: 10 and below -> PERSONS_10)

for frame in (mig1_data, mig5_data):
    persons = frame['PERSONS']
    # Lowest band has no lower bound: PERSONS <= 10
    frame['PERSONS_10'] = np.where(persons <= 10, 1, 0)
    # Remaining bands are inclusive on both ends: 11-20, 21-30, 31-40, 41-50
    for cutoff in range(20, 51, 10):
        frame[f'PERSONS_{cutoff}'] = np.where(persons.between(cutoff - 9, cutoff), 1, 0)
    # Drop Old Column
    frame.drop(columns=['PERSONS'], inplace=True)

Special case for combining several categories. Once again, `unique()` will be
useful. Note that we use `|` for element-wise `or`, just as we used `&` for `and` above.

In [42]:
# GQ: 3 indicator variables for [Household, Institution, Other]
# var name: GQ_<type> (ex: GQ_HOUSEHOLD)

# Labels that are merged into the single "other" indicator
gq_other_labels = [
    '1-person unit created by splitting large household',
    'Other group quarters',
]

for frame in (mig1_data, mig5_data):
    gq = frame['GQ']
    frame['GQ_HOUSEHOLD'] = np.where(gq.eq('Households'), 1, 0)
    frame['GQ_INSTITUTION'] = np.where(gq.eq('Institutions'), 1, 0)
    frame['GQ_OTHER'] = np.where(gq.isin(gq_other_labels), 1, 0)
    # Drop Old Column
    frame.drop(columns=['GQ'], inplace=True)

Use `.fillna(0)` to convert `nan` to `0`, then use `.astype(int)` to convert
the column to integers.

In [43]:
# URBAN (already binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['URBAN'] = frame['URBAN'].fillna(0).astype(int)

In [44]:
# OWNERSHIP (already binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['OWNERSHIP'] = frame['OWNERSHIP'].fillna(0).astype(int)

In [45]:
# PHONE (already binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['PHONE'] = frame['PHONE'].fillna(0).astype(int)

In [46]:
# AUTOS (already binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['AUTOS'] = frame['AUTOS'].fillna(0).astype(int)

In [47]:
# ROOMS: similar to PERSONS (bin by 5s)
# var name: ROOMS_<cutoff> (ex: 5 and below -> ROOMS_5)

# Inclusive (low, high) bounds for every band above the open-ended first one.
# The 11-15 and 16-20 bands might be redundant: not all of the data sets go
# this high.
rooms_bands = [(6, 10), (11, 15), (16, 20)]

# Create the new columns on both mig1 and mig5
for frame in (mig1_data, mig5_data):
    rooms = frame['ROOMS']
    frame['ROOMS_5'] = np.where(rooms <= 5, 1, 0)    # ROOMS ≤ 5
    for low, high in rooms_bands:
        # Each band uses its own bounds. ROOMS_20 used to reuse the 11-15
        # bounds, which made it a copy of ROOMS_15 and left 16-20 unflagged.
        frame[f'ROOMS_{high}'] = np.where(rooms.between(low, high), 1, 0)    # low ≤ ROOMS ≤ high
    # Drop Old Column
    frame.drop(columns=['ROOMS'], inplace=True)

In [60]:
# HHTYPE: collapse the household types into three indicators
#   HHTYPE_MARRIED: 'Married/cohab couple with children', 'Married/cohab couple, no children'
#   HHTYPE_SINGLE: 'Single-parent family', 'One-person household'
#   HHTYPE_OTHER: the remaining categories
married_hh_labels = [
    'Married/cohab couple with children',
    'Married/cohab couple, no children',
]
single_hh_labels = [
    'Single-parent family',
    'One-person household',
]

# Boolean membership masks for each frame
mig1_married = mig1_data['HHTYPE'].isin(married_hh_labels)
mig1_single = mig1_data['HHTYPE'].isin(single_hh_labels)
mig5_married = mig5_data['HHTYPE'].isin(married_hh_labels)
mig5_single = mig5_data['HHTYPE'].isin(single_hh_labels)

hhtype_inputs = (
    (mig1_data, mig1_married, mig1_single),
    (mig5_data, mig5_married, mig5_single),
)
for frame, married_mask, single_mask in hhtype_inputs:
    frame['HHTYPE_MARRIED'] = np.where(married_mask, 1, 0)
    frame['HHTYPE_SINGLE'] = np.where(single_mask, 1, 0)
    # Everything that is neither married nor single counts as "other"
    frame['HHTYPE_OTHER'] = np.where(married_mask | single_mask, 0, 1)
    # Drop Old Column
    frame.drop(columns=['HHTYPE'], inplace=True)

In [62]:
# RESIDENT: 1 for present, 0 for absent (only conversion, no expansion needed)
# Only 'Present resident' is coded 1; every other value (including
# 'Absent resident') is coded 0, as the header describes. The converted column
# is kept in place: dropping it here would discard the predictor just built.
for frame in (mig1_data, mig5_data):
    frame['RESIDENT'] = np.where(frame['RESIDENT'] == 'Present resident', 1, 0)

In [61]:
# FAMSIZE: similar to PERSONS (bin by 5s)
# var name: FAMSIZE_<cutoff> (ex: 5 and below -> FAMSIZE_5)

for frame in (mig1_data, mig5_data):
    famsize = frame['FAMSIZE']
    # Lowest band has no lower bound: FAMSIZE <= 5
    frame['FAMSIZE_5'] = np.where(famsize <= 5, 1, 0)
    # Inclusive bands 6-10, 11-15, ..., 41-45. The top bands (36-40, 41-45)
    # might be redundant: not all of the data sets go this high.
    for cutoff in range(10, 46, 5):
        frame[f'FAMSIZE_{cutoff}'] = np.where(famsize.between(cutoff - 4, cutoff), 1, 0)
    # Drop Old Column
    frame.drop(columns=['FAMSIZE'], inplace=True)

In [63]:
# NCHILD: similar to PERSONS (bin by 2s)
# var name: NCHILD_<cutoff> (ex: 2 and below -> NCHILD_2)

for frame in (mig1_data, mig5_data):
    nchild = frame['NCHILD']
    # Lowest band has no lower bound: NCHILD <= 2
    frame['NCHILD_2'] = np.where(nchild <= 2, 1, 0)
    # Inclusive bands 3-4, 5-6, 7-8, 9-10
    for cutoff in range(4, 11, 2):
        frame[f'NCHILD_{cutoff}'] = np.where(nchild.between(cutoff - 1, cutoff), 1, 0)
    # Drop Old Column
    frame.drop(columns=['NCHILD'], inplace=True)

In [64]:
# AGE: similar to PERSONS (bin by 10s) -> all the way to 100
# varname: AGE_<cutoff> (ex: 10 and below -> AGE_10)

for frame in (mig1_data, mig5_data):
    age = frame['AGE']
    # Lowest band has no lower bound: AGE <= 10
    frame['AGE_10'] = np.where(age <= 10, 1, 0)
    # Inclusive bands 11-20, 21-30, ..., 91-100
    for cutoff in range(20, 101, 10):
        frame[f'AGE_{cutoff}'] = np.where(age.between(cutoff - 9, cutoff), 1, 0)
    # Drop Old Column
    frame.drop(columns=['AGE'], inplace=True)


In [67]:
# MARST: 3 indicator variables
# categories:
#    - 'MARRIED': 'Married/in union'
#    - 'SINGLE': 'Single/never married'
#    - 'OTHER': 'Separated/divorced/spouse absent' or 'Widowed'
# varname: MARST_<category> (ex: MARST_MARRIED)
marst_other_labels = ['Separated/divorced/spouse absent', 'Widowed']

# Boolean masks for each group, per frame
mig1_marst = mig1_data['MARST']
mig1_married = mig1_marst == 'Married/in union'
mig1_single = mig1_marst == 'Single/never married'
mig1_other = mig1_marst.isin(marst_other_labels)

mig5_marst = mig5_data['MARST']
mig5_married = mig5_marst == 'Married/in union'
mig5_single = mig5_marst == 'Single/never married'
mig5_other = mig5_marst.isin(marst_other_labels)

marst_inputs = (
    (mig1_data, {'MARRIED': mig1_married, 'SINGLE': mig1_single, 'OTHER': mig1_other}),
    (mig5_data, {'MARRIED': mig5_married, 'SINGLE': mig5_single, 'OTHER': mig5_other}),
)
for frame, masks in marst_inputs:
    for category, mask in masks.items():
        frame[f'MARST_{category}'] = np.where(mask, 1, 0)
    # Drop Old Column
    frame.drop(columns=['MARST'], inplace=True)

In [66]:
# MORTMOT (already in binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['MORTMOT'] = frame['MORTMOT'].fillna(0).astype(int)

In [50]:
# MORTFAT (already in binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['MORTFAT'] = frame['MORTFAT'].fillna(0).astype(int)

In [51]:
# NATIVITY (already in binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['NATIVITY'] = frame['NATIVITY'].fillna(0).astype(int)

In [52]:
# CITIZEN (already in binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['CITIZEN'] = frame['CITIZEN'].fillna(0).astype(int)

In [68]:
# BPL1_MZ: 2 indicator variables
#   categories
#     - 'FOREIGN': 'Foreign Country'
#     - 'DOMESTIC': all others
#   var name: BP_<category> (ex: BP_FOREIGN)
# NOTE(review): "all others" includes missing birthplaces, so those rows are
# coded as domestic — confirm this is intended.

# Masks per frame; domestic is simply the complement of foreign
mig1_foreign = mig1_data['BPL1_MZ'].eq('Foreign Country')
mig1_domestic = ~mig1_foreign
mig5_foreign = mig5_data['BPL1_MZ'].eq('Foreign Country')
mig5_domestic = ~mig5_foreign

birthplace_inputs = (
    (mig1_data, mig1_foreign, mig1_domestic),
    (mig5_data, mig5_foreign, mig5_domestic),
)
for frame, foreign_mask, domestic_mask in birthplace_inputs:
    frame['BP_FOREIGN'] = np.where(foreign_mask, 1, 0)
    frame['BP_DOMESTIC'] = np.where(domestic_mask, 1, 0)
    # Drop Old Column
    frame.drop(columns=['BPL1_MZ'], inplace=True)

In [72]:
# SCHOOL: convert to SCHOOL (Yes=1, No=0) (no expansion, just convert)
# 'Yes' is coded 1; every other value (including missing) is coded 0.
# The converted column is kept in place: dropping it here would discard the
# predictor just built.
for frame in (mig1_data, mig5_data):
    frame['SCHOOL'] = np.where(frame['SCHOOL'] == 'Yes', 1, 0)

In [53]:
# LIT (already binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['LIT'] = frame['LIT'].fillna(0).astype(int)

In [69]:
# EDATTAIN: one indicator per attainment level
#   categories:
#    - 'NONE': 'Less than primary completed'
#    - 'PRIMARY': 'Primary completed'
#    - 'SECONDARY': 'Secondary completed'
#    - 'HIGHER': 'University completed'
# var name: EDU_<category> (ex: EDU_NONE)
edu_labels = {
    'NONE': 'Less than primary completed',
    'PRIMARY': 'Primary completed',
    'SECONDARY': 'Secondary completed',
    'HIGHER': 'University completed',
}

# Boolean masks per frame, in the same order as edu_labels
mig1_edu = mig1_data['EDATTAIN']
mig1_none, mig1_primary, mig1_secondary, mig1_higher = (
    mig1_edu == label for label in edu_labels.values()
)
mig5_edu = mig5_data['EDATTAIN']
mig5_none, mig5_primary, mig5_secondary, mig5_higher = (
    mig5_edu == label for label in edu_labels.values()
)

edu_inputs = (
    (mig1_data, (mig1_none, mig1_primary, mig1_secondary, mig1_higher)),
    (mig5_data, (mig5_none, mig5_primary, mig5_secondary, mig5_higher)),
)
for frame, masks in edu_inputs:
    for category, mask in zip(edu_labels, masks):
        frame[f'EDU_{category}'] = np.where(mask, 1, 0)
    # Drop Old Column
    frame.drop(columns=['EDATTAIN'], inplace=True)

In [71]:
# EMPSTAT: convert to EMPSTAT (Employed=1, Unemployed/Inactive=0)
# 'Employed' is coded 1; every other value (including missing) is coded 0.
# The converted column is kept in place: dropping it here would discard the
# predictor just built.
for frame in (mig1_data, mig5_data):
    frame['EMPSTAT'] = np.where(frame['EMPSTAT'] == 'Employed', 1, 0)

In [55]:
# LABFORCE (already binary): missing values become 0, then store as int
for frame in (mig1_data, mig5_data):
    frame['LABFORCE'] = frame['LABFORCE'].fillna(0).astype(int)

In [57]:
# mig_provincial -> MIG: integer version of the target (missing treated as 0)
# NOTE(review): this adds MIG but leaves mig_provincial in the frame; confirm
# it is removed before modelling so the target does not leak into the predictors.
for frame in (mig1_data, mig5_data):
    frame['MIG'] = frame['mig_provincial'].fillna(0).astype(int)