# Predictor Processing

In [1]:
# Load Dependencies
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load Custom Scripts
from src.utils.ipums_extract import (
    get_ipums_data,
    load_ipums_from_pkl,
)

## New Data Extract

In [None]:
# Define Parameters
# The IPUMS API key is read from the IPUMS_API_KEY environment variable so the
# real key never has to be typed into (or committed with) this notebook. The
# "key" fallback is only a placeholder used when the variable is not set.
API_KEY = os.environ.get("IPUMS_API_KEY", "key")
DOWNLOAD_DIR = Path("data")                  # passed to get_ipums_data as download_dir
PKL_EXPORT = True                            # passed to get_ipums_data as pkl_export
PKL_PATH = Path("data/ipums_extract.pkl")    # pickle file written and re-read below

# Extract definition: IPUMS collection, free-text description, and samples
collection = "ipumsi"
description = "data mining mozambique project"
samples = ["mz1997a", "mz2007a", "mz2017a"]

# Variables requested in the extract
variables = [
    'PERSONS', 'GQ', 'URBAN', 'GEO1_MZ', 'GEO2_MZ', 'OWNERSHIP',
    'PHONE', 'AUTOS', 'ROOMS', 'HHTYPE', 'RESIDENT', 'FAMSIZE',
    'NCHILD', 'AGE', 'SEX', 'MARST', 'MORTMOT', 'MORTFAT',
    'NATIVITY', 'CITIZEN', 'BPL1_MZ', 'SCHOOL', 'LIT',
    'EDATTAIN', 'EMPSTAT', 'LABFORCE', 'MIGRATE1', 'MIGRATE5',
]

In [4]:
# Get IPUMS Data
# Bundle the request settings defined in the parameter cell, then hand them to
# the project helper, which returns the extract as `ipums_df`.
extract_request = {
    "collection": collection,
    "description": description,
    "samples": samples,
    "variables": variables,
    "api_key": API_KEY,
    "download_dir": DOWNLOAD_DIR,
    "pkl_export": PKL_EXPORT,
    "pkl_path": PKL_PATH,
}
ipums_df = get_ipums_data(**extract_request)

Extract submitted to IPUMS. Extract ID: 9.
Waiting for extract to finish processing on IPUMS server...
Downloading extract to data ...


See the `ipums_conditions` attribute of this codebook for terms of use.
See the `ipums_citation` attribute of this codebook for the appropriate citation.


Extracting data from extract to DataFrame...
Shape of IPUMS Data Extract: (6283068, 39)
Updating DataFrame with labels...
Saving IPUMS DataFrame to data/ipums_extract.pkl ...
IPUMS dataset extraction complete.


In [5]:
# Load Data from PKL
# Re-load the extract from PKL_PATH (the same path given to get_ipums_data as
# pkl_path) so later cells can start from the saved file.
# NOTE(review): presumably this unpickles the file - only load pickles that
# were produced by this project; confirm against src.utils.ipums_extract.
ipums_df_pkl = load_ipums_from_pkl(PKL_PATH)

In [7]:
# Transform to fix NIU, unknown and other issues
# var_dictionaries.txt contains one Python expression: a dict keyed by column
# name whose values are {old_value: new_value} recode tables.
# NOTE(review): eval() executes whatever the file contains, so only use a
# trusted, version-controlled copy. If the file holds nothing but literals,
# ast.literal_eval would be the safer loader.
var_dicts = eval(Path('var_dictionaries.txt').read_text(encoding='utf-8'))

for col, recode in var_dicts.items():
    # Tables for columns that are not in this extract are skipped.
    if col in ipums_df_pkl.columns:
        # Values without an entry in the recode table are kept unchanged.
        # `table=recode` binds the current table to the lambda explicitly.
        ipums_df_pkl[col] = ipums_df_pkl[col].map(
            lambda x, table=recode: table.get(x, x)
        )

In [8]:
# Build one frame per migration variable (MIGRATE1 and MIGRATE5):
#   1. keep only rows where that variable is not missing (NIU/unknown rows,
#      per the original note, are the ones removed here),
#   2. copy it into the shared target column `mig_provincial`,
#   3. drop both raw MIGRATE* columns.
mig1_data = (
    ipums_df_pkl[ipums_df_pkl['MIGRATE1'].notna()]
    .assign(mig_provincial=lambda frame: frame['MIGRATE1'])
    .drop(columns=['MIGRATE1', 'MIGRATE5'])
)
mig5_data = (
    ipums_df_pkl[ipums_df_pkl['MIGRATE5'].notna()]
    .assign(mig_provincial=lambda frame: frame['MIGRATE5'])
    .drop(columns=['MIGRATE1', 'MIGRATE5'])
)

In [9]:
# Show the target's value counts for each frame; dropna=False means any
# remaining missing values would be listed as their own row.
for frame in (mig1_data, mig5_data):
    print(frame['mig_provincial'].value_counts(dropna=False))

mig_provincial
0.0    5860462
1.0      69067
Name: count, dtype: int64
mig_provincial
0.0    4746396
1.0     228173
Name: count, dtype: int64


In [10]:
# Drop Metadata Columns
# Identifier and weight columns are removed from both frames.
metadata_cols = [
    'COUNTRY', 'SAMPLE', 'SERIAL',
    'HHWT', 'PERNUM', 'PERWT',
]
mig1_data = mig1_data.drop(columns=metadata_cols)
mig5_data = mig5_data.drop(columns=metadata_cols)

In [None]:
# Drop Detailed Columns
# Removes the "...D" columns listed below, plus the two GEO*_MZ columns,
# from both frames.
detailed_cols = [
    'OWNERSHIPD', 'MARSTD', 'EDATTAIND', 'EMPSTATD',
    'GEO1_MZ', 'GEO2_MZ',
]
mig1_data = mig1_data.drop(columns=detailed_cols)
mig5_data = mig5_data.drop(columns=detailed_cols)

## Predictor conversion

In [21]:
# YEAR: 3 variables, (0 or 1) for [1997, 2007, 2017]
# var name: YEAR_<YEAR> (ex: YEAR_1997)

def add_year_dummies(frame):
    """Return `frame` with YEAR replaced by YEAR_1997 / YEAR_2007 / YEAR_2017.

    Each new column is 1 where YEAR equals that year's string label and 0
    otherwise. The original YEAR column is dropped from the result.
    """
    year_flags = {
        f'YEAR_{label}': np.where(frame['YEAR'] == label, 1, 0)
        for label in ('1997', '2007', '2017')
    }
    return frame.assign(**year_flags).drop(columns=['YEAR'])


# Apply the same encoding to both mig1 and mig5
mig1_data = add_year_dummies(mig1_data)
mig5_data = add_year_dummies(mig5_data)

For binning, use `unique()` (e.g., `mig1_data['PERSONS'].unique()`) to identify
the values that actually occur, and always leave headroom above the largest
observed value (where necessary/reasonable) so the bins generalize to new data.

In [None]:
# PERSONS: binning (by 10s?) [should be about 5 variables]
# var name: PERSONS_<cutoff> (ex: 10 and below -> PERSONS_10)

# Inclusive (lower, upper) bounds of every bin above the first one
persons_bins = [(11, 20), (21, 30), (31, 40), (41, 50)]


def add_persons_bins(frame):
    """Return `frame` with PERSONS replaced by 0/1 bin indicator columns.

    PERSONS_10 flags PERSONS <= 10; each later PERSONS_<upper> flags
    lower <= PERSONS <= upper for the bounds in `persons_bins`.
    A row whose PERSONS is above 50 or missing gets 0 in every bin.
    """
    persons = frame['PERSONS']
    bin_flags = {'PERSONS_10': np.where(persons <= 10, 1, 0)}
    for lower, upper in persons_bins:
        bin_flags[f'PERSONS_{upper}'] = np.where(
            (persons >= lower) & (persons <= upper), 1, 0
        )
    return frame.assign(**bin_flags).drop(columns=['PERSONS'])


# Apply the same binning to both mig1 and mig5
mig1_data = add_persons_bins(mig1_data)
mig5_data = add_persons_bins(mig5_data)

Special case for combining several categories. Once again, `unique()` will be
useful. Note that we use `|` for `or`, just as we used `&` for `and` above.

In [31]:
# GQ: 3 variables for [Household, Institution, Other]
# var name: GQ_<type> (ex: GQ_HOUSEHOLD)

def add_gq_dummies(frame):
    """Return `frame` with GQ replaced by three 0/1 indicator columns.

    GQ_HOUSEHOLD and GQ_INSTITUTION each match a single GQ label; GQ_OTHER
    combines the two remaining labels. A row matching none of the labels gets
    0 in all three columns.
    """
    gq = frame['GQ']
    is_split_unit = gq == '1-person unit created by splitting large household'
    is_other_quarters = gq == 'Other group quarters'
    return frame.assign(
        GQ_HOUSEHOLD=np.where(gq == 'Households', 1, 0),
        GQ_INSTITUTION=np.where(gq == 'Institutions', 1, 0),
        GQ_OTHER=np.where(is_split_unit | is_other_quarters, 1, 0),
    ).drop(columns=['GQ'])


# Apply the same encoding to both mig1 and mig5
mig1_data = add_gq_dummies(mig1_data)
mig5_data = add_gq_dummies(mig5_data)

Use `.fillna(0)` to convert `NaN` to `0`, then use `.astype(int)` to convert
the column to integers.

In [35]:
# URBAN (already binary, convert to int)
# Missing values are filled with 0 before the integer cast.
mig1_data = mig1_data.assign(URBAN=lambda frame: frame['URBAN'].fillna(0).astype(int))
mig5_data = mig5_data.assign(URBAN=lambda frame: frame['URBAN'].fillna(0).astype(int))

In [None]:
# OWNERSHIP (already binary, convert to int)

In [None]:
# PHONE (already binary, convert to int)

In [None]:
# AUTOS (already binary, convert to int)

In [None]:
# ROOMS: similar to PERSONS (bin by 5s)
# var name: ROOMS_<cutoff> (ex: 5 and below -> ROOMS_5)

In [None]:
# HHTYPE: combine as follows
#   HHTYPE_MARRIED: 'Married/cohab couple with children', 'Married/cohab couple, no children'
#   HHTYPE_SINGLE: 'Single-parent family', 'One-person household'
#   HHTYPE_OTHER: the remaining categories

In [None]:
# RESIDENT: 1 for present, 0 for absent (only conversion, no expansion needed)

In [None]:
# FAMSIZE: similar to PERSONS (bin by 5s)
# var name: FAMSIZE_<cutoff> (ex: 5 and below -> FAMSIZE_5)

In [None]:
# NCHILD: similar to PERSONS (bin by 2s)
# var name: NCHILD_<cutoff> (ex: 2 and below -> NCHILD_2)

In [None]:
# AGE: similar to PERSONS (bin by 10s) -> all the way to 100
# varname: AGE_<cutoff> (ex: 10 and below -> AGE_10)

In [None]:
# MARST: 3 variables
# categories:
#    - 'MARRIED': 'Married/in union'
#    - 'SINGLE': 'Single/never married'
#    - 'OTHER': 'Separated...' or 'Widowed'
# varname: MARST_<category> (ex: MARST_MARRIED)

In [None]:
# MORTMOT (already in binary, convert to int)

In [None]:
# MORTFAT (already in binary, convert to int)

In [None]:
# NATIVITY (already in binary, convert to int)

In [None]:
# CITIZEN (already in binary, convert to int)

In [None]:
# BPL1_MZ: 2 variables
#   categories
#     - 'FOREIGN': 'Foreign Country'
#     - 'DOMESTIC': all others
#   var name: BP_<category> (ex: BP_FOREIGN)

In [None]:
# SCHOOL: convert to SCHOOL (Yes=1, No=0) (no expansion, just convert)

In [None]:
# LIT (already binary, convert to int)

In [None]:
# EDATTAIN: 
#   categories:
#    - 'NONE': 'Less than primary completed'
#    - 'PRIMARY': 'Primary completed'
#    - 'SECONDARY': 'Secondary completed'
#    - 'HIGHER': 'University completed'
# var name: EDU_<category> (ex: EDU_NONE)

In [None]:
# EMPSTAT: convert to EMPSTAT (Employed=1, Unemployed/Inactive=0)

In [None]:
# LABFORCE (already binary, convert to int)

In [None]:
# mig_provincial (rename to MIG, convert to int)